LLVM 20.0.0git
ARMISelLowering.cpp
Go to the documentation of this file.
1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
67#include "llvm/IR/Attributes.h"
68#include "llvm/IR/CallingConv.h"
69#include "llvm/IR/Constant.h"
70#include "llvm/IR/Constants.h"
71#include "llvm/IR/DataLayout.h"
72#include "llvm/IR/DebugLoc.h"
74#include "llvm/IR/Function.h"
75#include "llvm/IR/GlobalAlias.h"
76#include "llvm/IR/GlobalValue.h"
78#include "llvm/IR/IRBuilder.h"
79#include "llvm/IR/InlineAsm.h"
80#include "llvm/IR/Instruction.h"
83#include "llvm/IR/Intrinsics.h"
84#include "llvm/IR/IntrinsicsARM.h"
85#include "llvm/IR/Module.h"
87#include "llvm/IR/Type.h"
88#include "llvm/IR/User.h"
89#include "llvm/IR/Value.h"
90#include "llvm/MC/MCInstrDesc.h"
93#include "llvm/MC/MCSchedule.h"
100#include "llvm/Support/Debug.h"
108#include <algorithm>
109#include <cassert>
110#include <cstdint>
111#include <cstdlib>
112#include <iterator>
113#include <limits>
114#include <optional>
115#include <tuple>
116#include <utility>
117#include <vector>
118
119using namespace llvm;
120using namespace llvm::PatternMatch;
121
122#define DEBUG_TYPE "arm-isel"
123
// Statistic counters declared through the STATISTIC macro; each entry pairs a
// counter name with the human-readable description string given next to it.
124STATISTIC(NumTailCalls, "Number of tail calls");
125STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
126STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
127STATISTIC(NumConstpoolPromoted,
128 "Number of constants with their storage promoted into constant pools");
129
// Hidden boolean command-line option "arm-interworking". It is initialised to
// true and, per its own description, is intended for debugging only.
130static cl::opt<bool>
131ARMInterworking("arm-interworking", cl::Hidden,
132 cl::desc("Enable / disable ARM interworking (for debugging only)"),
133 cl::init(true));
134
136 "arm-promote-constant", cl::Hidden,
137 cl::desc("Enable / disable promotion of unnamed_addr constants into "
138 "constant pools"),
139 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
141 "arm-promote-constant-max-size", cl::Hidden,
142 cl::desc("Maximum size of constant to promote into a constant pool"),
143 cl::init(64));
145 "arm-promote-constant-max-total", cl::Hidden,
146 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
147 cl::init(128));
148
150MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
151 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
152 cl::init(2));
153
154// The APCS parameter registers.
// Four general-purpose registers, listed in ascending order R0..R3.
155static const MCPhysReg GPRArgRegs[] = {
156 ARM::R0, ARM::R1, ARM::R2, ARM::R3
157};
158
160 SelectionDAG &DAG, const SDLoc &DL) {
162 assert(Arg.ArgVT.bitsLT(MVT::i32));
163 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
164 SDValue Ext =
166 MVT::i32, Trunc);
167 return Ext;
168}
169
// Applies the operation actions common to a NEON vector type VT. It is called
// by addDRTypeForNEON and addQRTypeForNEON.
//
// VT              - the vector type being configured.
// PromotedLdStVT  - the type that LOAD/STORE of VT are promoted to when it
//                   differs from VT.
//
// NOTE(review): the embedded listing numbers jump in several places (for
// example 180 -> 184, 194 -> 203, 209 -> 218, 220 -> 222), so a number of the
// original statements are not present here. The comments below describe only
// what is visible.
170void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
// When the two types differ, LOAD and STORE on VT are marked Promote and
// redirected to PromotedLdStVT.
171 if (VT != PromotedLdStVT) {
172 setOperationAction(ISD::LOAD, VT, Promote);
173 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
174
175 setOperationAction(ISD::STORE, VT, Promote);
176 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
177 }
178
// The element type drives the branches below; their bodies are among the
// statements missing from this listing.
179 MVT ElemTy = VT.getVectorElementType();
180 if (ElemTy != MVT::f64)
184 if (ElemTy == MVT::i32) {
189 } else {
194 }
203 if (VT.isInteger()) {
207 }
208
209 // Neon does not support vector divide/remainder operations.
218
// The opcodes in the list below are Legal for non-floating-point vectors other
// than v2i64 and v1i64. The tail of the opcode list sits on a line that is
// missing from this listing.
219 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
220 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
222 setOperationAction(Opcode, VT, Legal);
// Saturating add/subtract are Legal for every non-floating-point VT, including
// the 64-bit element types excluded above.
223 if (!VT.isFloatingPoint())
224 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
225 setOperationAction(Opcode, VT, Legal);
226}
227
228void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
229 addRegisterClass(VT, &ARM::DPRRegClass);
230 addTypeForNEON(VT, MVT::f64);
231}
232
233void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
234 addRegisterClass(VT, &ARM::DPairRegClass);
235 addTypeForNEON(VT, MVT::v2f64);
236}
237
// Marks every opcode in [0, ISD::BUILTIN_OP_END) as Expand for VT, then
// re-enables the few trivial operations listed below as Legal.
//
// NOTE(review): listing line 248 is absent between the STORE line and the
// closing brace, so one statement of the original function is not shown here.
238void ARMTargetLowering::setAllExpand(MVT VT) {
239 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
240 setOperationAction(Opc, VT, Expand);
241
242 // We support these really simple operations even on types where all
243 // the actual arithmetic has to be broken down into simpler
244 // operations or turned into library calls.
245 setOperationAction(ISD::BITCAST, VT, Legal);
246 setOperationAction(ISD::LOAD, VT, Legal);
247 setOperationAction(ISD::STORE, VT, Legal);
249}
250
251void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
252 LegalizeAction Action) {
253 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
254 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
255 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
256}
257
// Registers the MVE vector types and sets their operation actions.
//
// HasMVEFP - whether MVE floating-point operations are available. When false,
//            the float vector types are first set all-Expand via setAllExpand
//            and the FP-only actions further down are skipped.
//
// The function works through, in order: the integer vector types, the float
// vector types, small-vector FP reductions, the 64-bit element types,
// extending loads / truncating stores, and finally the predicate types.
//
// NOTE(review): the embedded listing numbers have many gaps (for example
// 262 -> 276, 277 -> 292, 315 -> 320, 408 -> 414, 485 -> 494), so a large part
// of the original body is not present here. Comments describe only what is
// visible.
258void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
259 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
260
// Integer vector types: all are placed in the MQPR register class.
261 for (auto VT : IntTypes) {
262 addRegisterClass(VT, &ARM::MQPRRegClass);
// Masked loads get custom lowering while masked stores are Legal.
276 setOperationAction(ISD::MLOAD, VT, Custom);
277 setOperationAction(ISD::MSTORE, VT, Legal);
292
293 // No native support for these.
303
304 // Vector reductions
// ADD and the signed/unsigned MIN/MAX reductions are Legal; MUL and the
// bitwise reductions are custom lowered.
305 setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
306 setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
307 setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
308 setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
309 setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
310 setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
311 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
312 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
313 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
314
315 if (!HasMVEFP) {
320 } else {
323 }
324
325 // Pre and Post inc are supported on loads and stores
326 for (unsigned im = (unsigned)ISD::PRE_INC;
332 }
333 }
334
// Float vector types: same register class as the integer types. Without
// MVE.fp everything is first set to Expand.
335 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
336 for (auto VT : FloatTypes) {
337 addRegisterClass(VT, &ARM::MQPRRegClass);
338 if (!HasMVEFP)
339 setAllExpand(VT);
340
341 // These are legal or custom whether we have MVE.fp or not
350 setOperationAction(ISD::MLOAD, VT, Custom);
351 setOperationAction(ISD::MSTORE, VT, Legal);
354
355 // Pre and Post inc are supported on loads and stores
356 for (unsigned im = (unsigned)ISD::PRE_INC;
362 }
363
// Actions that apply only when MVE floating point is available.
364 if (HasMVEFP) {
365 setOperationAction(ISD::FMINNUM, VT, Legal);
366 setOperationAction(ISD::FMAXNUM, VT, Legal);
367 setOperationAction(ISD::FROUND, VT, Legal);
368 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
369 setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
370 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
371 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
372
373 // No native support for these.
376 setOperationAction(ISD::FSQRT, VT, Expand);
377 setOperationAction(ISD::FSIN, VT, Expand);
378 setOperationAction(ISD::FCOS, VT, Expand);
379 setOperationAction(ISD::FTAN, VT, Expand);
380 setOperationAction(ISD::FPOW, VT, Expand);
381 setOperationAction(ISD::FLOG, VT, Expand);
382 setOperationAction(ISD::FLOG2, VT, Expand);
383 setOperationAction(ISD::FLOG10, VT, Expand);
384 setOperationAction(ISD::FEXP, VT, Expand);
385 setOperationAction(ISD::FEXP2, VT, Expand);
386 setOperationAction(ISD::FEXP10, VT, Expand);
387 setOperationAction(ISD::FNEARBYINT, VT, Expand);
388 }
389 }
390
391 // Custom Expand smaller than legal vector reductions to prevent false zero
392 // items being added.
393 setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
394 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
395 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
396 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
397 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
398 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
399 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
400 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
401
402 // We 'support' these types up to bitcast/load/store level, regardless of
403 // MVE integer-only / float support. Only doing FP data processing on the FP
404 // vector types is inhibited at integer-only level.
405 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
406 for (auto VT : LongTypes) {
407 addRegisterClass(VT, &ARM::MQPRRegClass);
408 setAllExpand(VT);
414 }
416
417 // We can do bitwise operations on v2i64 vectors
418 setOperationAction(ISD::AND, MVT::v2i64, Legal);
419 setOperationAction(ISD::OR, MVT::v2i64, Legal);
420 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
421
422 // Extending loads are legal for v8i8 -> v8i16, v4i16 -> v4i32 and v4i8 -> v4i32.
423 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
424 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
425 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
426
427 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
433
434 // Some truncating stores are legal too.
// These are the same three type pairs as the extending loads above.
435 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
436 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
437 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
438
439 // Pre and Post inc on these are legal, given the correct extends
440 for (unsigned im = (unsigned)ISD::PRE_INC;
442 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
447 }
448 }
449
450 // Predicate types
// The i1 vector types go in the VCCR register class; their loads and stores
// are custom lowered.
451 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
452 for (auto VT : pTypes) {
453 addRegisterClass(VT, &ARM::VCCRRegClass);
462 setOperationAction(ISD::LOAD, VT, Custom);
463 setOperationAction(ISD::STORE, VT, Custom);
468
469 if (!HasMVEFP) {
474 }
475 }
479 setOperationAction(ISD::OR, MVT::v2i1, Expand);
485
494}
495
497 const ARMSubtarget &STI)
498 : TargetLowering(TM), Subtarget(&STI) {
499 RegInfo = Subtarget->getRegisterInfo();
500 Itins = Subtarget->getInstrItineraryData();
501
504
505 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
506 !Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) {
507 bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
508 for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
509 setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
510 IsHFTarget ? CallingConv::ARM_AAPCS_VFP
512 }
513
514 if (Subtarget->isTargetMachO()) {
515 // Uses VFP for Thumb libfuncs if available.
516 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
517 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
518 static const struct {
519 const RTLIB::Libcall Op;
520 const char * const Name;
521 const ISD::CondCode Cond;
522 } LibraryCalls[] = {
523 // Single-precision floating-point arithmetic.
524 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
525 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
526 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
527 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
528
529 // Double-precision floating-point arithmetic.
530 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
531 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
532 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
533 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
534
535 // Single-precision comparisons.
536 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
537 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
538 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
539 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
540 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
541 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
542 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
543
544 // Double-precision comparisons.
545 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
546 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
547 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
548 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
549 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
550 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
551 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
552
553 // Floating-point to integer conversions.
554 // i64 conversions are done via library routines even when generating VFP
555 // instructions, so use the same ones.
556 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
557 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
558 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
559 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
560
561 // Conversions between floating types.
562 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
563 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },
564
565 // Integer to floating-point conversions.
566 // i64 conversions are done via library routines even when generating VFP
567 // instructions, so use the same ones.
568 // FIXME: There appears to be some naming inconsistency in ARM libgcc:
569 // e.g., __floatunsidf vs. __floatunssidfvfp.
570 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
571 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
572 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
573 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
574 };
575
576 for (const auto &LC : LibraryCalls) {
577 setLibcallName(LC.Op, LC.Name);
578 if (LC.Cond != ISD::SETCC_INVALID)
579 setCmpLibcallCC(LC.Op, LC.Cond);
580 }
581 }
582 }
583
584 // RTLIB
585 if (Subtarget->isAAPCS_ABI() &&
586 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
587 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
588 static const struct {
589 const RTLIB::Libcall Op;
590 const char * const Name;
591 const CallingConv::ID CC;
592 const ISD::CondCode Cond;
593 } LibraryCalls[] = {
594 // Double-precision floating-point arithmetic helper functions
595 // RTABI chapter 4.1.2, Table 2
596 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
597 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
598 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
599 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
600
601 // Double-precision floating-point comparison helper functions
602 // RTABI chapter 4.1.2, Table 3
603 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
604 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
605 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
606 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
607 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
608 { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
609 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
610
611 // Single-precision floating-point arithmetic helper functions
612 // RTABI chapter 4.1.2, Table 4
613 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
614 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
615 { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
616 { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
617
618 // Single-precision floating-point comparison helper functions
619 // RTABI chapter 4.1.2, Table 5
620 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
621 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
622 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
623 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
624 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
625 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
626 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
627
628 // Floating-point to integer conversions.
629 // RTABI chapter 4.1.2, Table 6
630 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
631 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
632 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
633 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
634 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
635 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
636 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
637 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
638
639 // Conversions between floating types.
640 // RTABI chapter 4.1.2, Table 7
641 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
642 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
643 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
644
645 // Integer to floating-point conversions.
646 // RTABI chapter 4.1.2, Table 8
647 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
648 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
649 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
650 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
651 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
652 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
653 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
654 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
655
656 // Long long helper functions
657 // RTABI chapter 4.2, Table 9
658 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
659 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
660 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
661 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
662
663 // Integer division functions
664 // RTABI chapter 4.3.1
665 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
666 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
667 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
668 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
669 { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
670 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
671 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
672 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
673 };
674
675 for (const auto &LC : LibraryCalls) {
676 setLibcallName(LC.Op, LC.Name);
677 setLibcallCallingConv(LC.Op, LC.CC);
678 if (LC.Cond != ISD::SETCC_INVALID)
679 setCmpLibcallCC(LC.Op, LC.Cond);
680 }
681
682 // EABI dependent RTLIB
683 if (TM.Options.EABIVersion == EABI::EABI4 ||
684 TM.Options.EABIVersion == EABI::EABI5) {
685 static const struct {
686 const RTLIB::Libcall Op;
687 const char *const Name;
688 const CallingConv::ID CC;
689 const ISD::CondCode Cond;
690 } MemOpsLibraryCalls[] = {
691 // Memory operations
692 // RTABI chapter 4.3.4
693 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
694 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
695 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
696 };
697
698 for (const auto &LC : MemOpsLibraryCalls) {
699 setLibcallName(LC.Op, LC.Name);
700 setLibcallCallingConv(LC.Op, LC.CC);
701 if (LC.Cond != ISD::SETCC_INVALID)
702 setCmpLibcallCC(LC.Op, LC.Cond);
703 }
704 }
705 }
706
707 if (Subtarget->isTargetWindows()) {
708 static const struct {
709 const RTLIB::Libcall Op;
710 const char * const Name;
711 const CallingConv::ID CC;
712 } LibraryCalls[] = {
713 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
714 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
715 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
716 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
717 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
718 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
719 { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
720 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
721 };
722
723 for (const auto &LC : LibraryCalls) {
724 setLibcallName(LC.Op, LC.Name);
725 setLibcallCallingConv(LC.Op, LC.CC);
726 }
727 }
728
729 // Use divmod compiler-rt calls for iOS 5.0 and later.
730 if (Subtarget->isTargetMachO() &&
731 !(Subtarget->isTargetIOS() &&
732 Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
733 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
734 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
735 }
736
737 // The half <-> float conversion functions are always soft-float on
738 // non-watchos platforms, but are needed for some targets which use a
739 // hard-float calling convention by default.
740 if (!Subtarget->isTargetWatchABI()) {
741 if (Subtarget->isAAPCS_ABI()) {
742 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
743 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
744 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
745 } else {
746 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
747 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
748 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
749 }
750 }
751
752 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
753 // a __gnu_ prefix (which is the default).
754 if (Subtarget->isTargetAEABI()) {
755 static const struct {
756 const RTLIB::Libcall Op;
757 const char * const Name;
758 const CallingConv::ID CC;
759 } LibraryCalls[] = {
760 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
761 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
762 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
763 };
764
765 for (const auto &LC : LibraryCalls) {
766 setLibcallName(LC.Op, LC.Name);
767 setLibcallCallingConv(LC.Op, LC.CC);
768 }
769 }
770
771 if (Subtarget->isThumb1Only())
772 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
773 else
774 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
775
776 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
777 Subtarget->hasFPRegs()) {
778 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
779 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
780
785
786 if (!Subtarget->hasVFP2Base())
787 setAllExpand(MVT::f32);
788 if (!Subtarget->hasFP64())
789 setAllExpand(MVT::f64);
790 }
791
792 if (Subtarget->hasFullFP16()) {
793 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
794 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
795 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
796
797 setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
798 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
799 }
800
801 if (Subtarget->hasBF16()) {
802 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
803 setAllExpand(MVT::bf16);
804 if (!Subtarget->hasFullFP16())
805 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
806 }
807
809 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
810 setTruncStoreAction(VT, InnerVT, Expand);
811 addAllExtLoads(VT, InnerVT, Expand);
812 }
813
816
818 }
819
822
825
826 if (Subtarget->hasMVEIntegerOps())
827 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
828
829 // Combine low-overhead loop intrinsics so that we can lower i1 types.
830 if (Subtarget->hasLOB()) {
831 setTargetDAGCombine({ISD::BRCOND, ISD::BR_CC});
832 }
833
834 if (Subtarget->hasNEON()) {
835 addDRTypeForNEON(MVT::v2f32);
836 addDRTypeForNEON(MVT::v8i8);
837 addDRTypeForNEON(MVT::v4i16);
838 addDRTypeForNEON(MVT::v2i32);
839 addDRTypeForNEON(MVT::v1i64);
840
841 addQRTypeForNEON(MVT::v4f32);
842 addQRTypeForNEON(MVT::v2f64);
843 addQRTypeForNEON(MVT::v16i8);
844 addQRTypeForNEON(MVT::v8i16);
845 addQRTypeForNEON(MVT::v4i32);
846 addQRTypeForNEON(MVT::v2i64);
847
848 if (Subtarget->hasFullFP16()) {
849 addQRTypeForNEON(MVT::v8f16);
850 addDRTypeForNEON(MVT::v4f16);
851 }
852
853 if (Subtarget->hasBF16()) {
854 addQRTypeForNEON(MVT::v8bf16);
855 addDRTypeForNEON(MVT::v4bf16);
856 }
857 }
858
859 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
860 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
861 // none of Neon, MVE or VFP supports any arithmetic operations on it.
862 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
863 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
864 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
865 // FIXME: Code duplication: FDIV and FREM are expanded always, see
866 // ARMTargetLowering::addTypeForNEON method for details.
867 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
868 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
869 // FIXME: Create unittest.
870 // In another words, find a way when "copysign" appears in DAG with vector
871 // operands.
873 // FIXME: Code duplication: SETCC has custom operation action, see
874 // ARMTargetLowering::addTypeForNEON method for details.
876 // FIXME: Create unittest for FNEG and for FABS.
877 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
878 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
879 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
880 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
881 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
882 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
883 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
884 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
885 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
886 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
887 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
888 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
889 setOperationAction(ISD::FEXP10, MVT::v2f64, Expand);
890 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
891 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
892 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
893 setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
894 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
895 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
896 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
897 }
898
899 if (Subtarget->hasNEON()) {
900 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
901 // supported for v4f32.
902 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
903 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
904 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
905 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
906 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
907 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
908 setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
909 setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
910 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
911 setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
912 setOperationAction(ISD::FEXP10, MVT::v4f32, Expand);
913 setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
914 setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
915 setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
916 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
917 setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
918
919 // Mark v2f32 intrinsics.
920 setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
921 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
922 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
923 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
924 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
925 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
926 setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
927 setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
928 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
929 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
930 setOperationAction(ISD::FEXP10, MVT::v2f32, Expand);
931 setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
932 setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
933 setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
934 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
935 setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
936
937 // Neon does not support some operations on v1i64 and v2i64 types.
938 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
939 // Custom handling for some quad-vector types to detect VMULL.
940 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
941 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
942 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
943 // Custom handling for some vector types to avoid expensive expansions
944 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
946 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
948 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
949 // a destination type that is wider than the source, and nor does
950 // it have a FP_TO_[SU]INT instruction with a narrower destination than
951 // source.
960
962 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
963
964 // NEON does not have single instruction CTPOP for vectors with element
965 // types wider than 8-bits. However, custom lowering can leverage the
966 // v8i8/v16i8 vcnt instruction.
973
974 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
975 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
976
977 // NEON does not have single instruction CTTZ for vectors.
979 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
980 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
981 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
982
983 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
984 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
985 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
986 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
987
992
997
1001 }
1002
1003 // NEON only has FMA instructions as of VFP4.
1004 if (!Subtarget->hasVFP4Base()) {
1005 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
1006 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
1007 }
1008
1010 ISD::FP_TO_UINT, ISD::FMUL, ISD::LOAD});
1011
1012 // It is legal to extload from v4i8 to v4i16 or v4i32.
1013 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
1014 MVT::v2i32}) {
1019 }
1020 }
1021
1022 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1023 MVT::v4i32}) {
1024 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1025 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1026 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1027 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1028 }
1029 }
1030
1031 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
1037 ISD::INTRINSIC_VOID, ISD::VECREDUCE_ADD, ISD::ADD, ISD::BITCAST});
1038 }
1039 if (Subtarget->hasMVEIntegerOps()) {
1041 ISD::FP_EXTEND, ISD::SELECT, ISD::SELECT_CC,
1042 ISD::SETCC});
1043 }
1044 if (Subtarget->hasMVEFloatOps()) {
1046 }
1047
1048 if (!Subtarget->hasFP64()) {
1049 // When targeting a floating-point unit with only single-precision
1050 // operations, f64 is legal for the few double-precision instructions which
1051 // are present. However, no double-precision operations other than moves,
1052 // loads and stores are provided by the hardware.
1061 setOperationAction(ISD::FNEG, MVT::f64, Expand);
1062 setOperationAction(ISD::FABS, MVT::f64, Expand);
1063 setOperationAction(ISD::FSQRT, MVT::f64, Expand);
1064 setOperationAction(ISD::FSIN, MVT::f64, Expand);
1065 setOperationAction(ISD::FCOS, MVT::f64, Expand);
1066 setOperationAction(ISD::FPOW, MVT::f64, Expand);
1067 setOperationAction(ISD::FLOG, MVT::f64, Expand);
1068 setOperationAction(ISD::FLOG2, MVT::f64, Expand);
1069 setOperationAction(ISD::FLOG10, MVT::f64, Expand);
1070 setOperationAction(ISD::FEXP, MVT::f64, Expand);
1071 setOperationAction(ISD::FEXP2, MVT::f64, Expand);
1072 setOperationAction(ISD::FEXP10, MVT::f64, Expand);
1073 setOperationAction(ISD::FCEIL, MVT::f64, Expand);
1074 setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
1075 setOperationAction(ISD::FRINT, MVT::f64, Expand);
1076 setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
1077 setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
1090 }
1091
1092 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
1093 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
1095 if (Subtarget->hasFullFP16()) {
1098 }
1099 }
1100
1101 if (!Subtarget->hasFP16()) {
1102 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
1104 }
1105
1107
1108 // ARM does not have floating-point extending loads.
1109 for (MVT VT : MVT::fp_valuetypes()) {
1110 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1111 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1112 }
1113
1114 // ... or truncating stores
1115 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
1116 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
1117 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
1118
1119 // ARM does not have i1 sign extending load.
1120 for (MVT VT : MVT::integer_valuetypes())
1121 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
1122
1123 // ARM supports all 4 flavors of integer indexed load / store.
1124 if (!Subtarget->isThumb1Only()) {
1125 for (unsigned im = (unsigned)ISD::PRE_INC;
1127 setIndexedLoadAction(im, MVT::i1, Legal);
1128 setIndexedLoadAction(im, MVT::i8, Legal);
1129 setIndexedLoadAction(im, MVT::i16, Legal);
1130 setIndexedLoadAction(im, MVT::i32, Legal);
1131 setIndexedStoreAction(im, MVT::i1, Legal);
1132 setIndexedStoreAction(im, MVT::i8, Legal);
1133 setIndexedStoreAction(im, MVT::i16, Legal);
1134 setIndexedStoreAction(im, MVT::i32, Legal);
1135 }
1136 } else {
1137 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
1140 }
1141
1146
1149 if (Subtarget->hasDSP()) {
1158 }
1159 if (Subtarget->hasBaseDSP()) {
1162 }
1163
1164 // i64 operation support.
1167 if (Subtarget->isThumb1Only()) {
1170 }
1171 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1172 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1174
1182 setOperationAction(ISD::LOAD, MVT::i64, Custom);
1183 setOperationAction(ISD::STORE, MVT::i64, Custom);
1184
1185 // MVE lowers 64 bit shifts to lsll and lsrl
1186 // assuming that ISD::SRL and SRA of i64 are already marked custom
1187 if (Subtarget->hasMVEIntegerOps())
1189
1190 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1191 if (Subtarget->isThumb1Only()) {
1195 }
1196
1197 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1199
1200 // ARM does not have ROTL.
1205 }
1208 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1211 }
1212
1213 // @llvm.readcyclecounter requires the Performance Monitors extension.
1214 // Default to the 0 expansion on unsupported platforms.
1215 // FIXME: Technically there are older ARM CPUs that have
1216 // implementation-specific ways of obtaining this information.
1217 if (Subtarget->hasPerfMon())
1218 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
1219
1220 // Only ARMv6 has BSWAP.
1221 if (!Subtarget->hasV6Ops())
1223
1224 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1225 : Subtarget->hasDivideInARMMode();
1226 if (!hasDivide) {
1227 // These are expanded into libcalls if the cpu doesn't have HW divider.
1230 }
1231
1232 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
1235
1238 }
1239
1242
1243 // Register based DivRem for AEABI (RTABI 4.2)
1244 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
1245 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
1246 Subtarget->isTargetWindows()) {
1249 HasStandaloneRem = false;
1250
1251 if (Subtarget->isTargetWindows()) {
1252 const struct {
1253 const RTLIB::Libcall Op;
1254 const char * const Name;
1255 const CallingConv::ID CC;
1256 } LibraryCalls[] = {
1257 { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
1258 { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
1259 { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
1260 { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
1261
1262 { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
1263 { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
1264 { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
1265 { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
1266 };
1267
1268 for (const auto &LC : LibraryCalls) {
1269 setLibcallName(LC.Op, LC.Name);
1270 setLibcallCallingConv(LC.Op, LC.CC);
1271 }
1272 } else {
1273 const struct {
1274 const RTLIB::Libcall Op;
1275 const char * const Name;
1276 const CallingConv::ID CC;
1277 } LibraryCalls[] = {
1278 { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1279 { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1280 { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1281 { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
1282
1283 { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1284 { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1285 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1286 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
1287 };
1288
1289 for (const auto &LC : LibraryCalls) {
1290 setLibcallName(LC.Op, LC.Name);
1291 setLibcallCallingConv(LC.Op, LC.CC);
1292 }
1293 }
1294
1299 } else {
1302 }
1303
1308
1309 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1310 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1311
1312 // Use the default implementation.
1313 setOperationAction(ISD::VASTART, MVT::Other, Custom);
1314 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1315 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
1316 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1317 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
1318 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
1319
1320 if (Subtarget->isTargetWindows())
1321 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
1322 else
1323 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
1324
1325 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1326 // the default expansion.
1327 InsertFencesForAtomic = false;
1328 if (Subtarget->hasAnyDataBarrier() &&
1329 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1330 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1331 // to ldrex/strex loops already.
1332 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
1333 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1334 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
1335
1336 // On v8, we have particularly efficient implementations of atomic fences
1337 // if they can be combined with nearby atomic loads and stores.
1338 if (!Subtarget->hasAcquireRelease() ||
1339 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1340 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1341 InsertFencesForAtomic = true;
1342 }
1343 } else {
1344 // If there's anything we can use as a barrier, go through custom lowering
1345 // for ATOMIC_FENCE.
1346 // If target has DMB in thumb, Fences can be inserted.
1347 if (Subtarget->hasDataBarrier())
1348 InsertFencesForAtomic = true;
1349
1350 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
1351 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1352
1353 // Set them all for libcall, which will force libcalls.
1354 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
1355 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
1356 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
1357 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
1358 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, LibCall);
1359 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
1360 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
1361 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, LibCall);
1362 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, LibCall);
1363 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, LibCall);
1364 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, LibCall);
1365 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, LibCall);
1366 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1367 // Unordered/Monotonic case.
1368 if (!InsertFencesForAtomic) {
1369 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1370 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1371 }
1372 }
1373
1374 // Compute supported atomic widths.
1375 if (Subtarget->isTargetLinux() ||
1376 (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1377 // For targets where __sync_* routines are reliably available, we use them
1378 // if necessary.
1379 //
1380 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1381 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1382 //
1383 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1384 // such targets should provide __sync_* routines, which use the ARM mode
1385 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1386 // encoding; see ARMISD::MEMBARRIER_MCR.)
1388 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1389 Subtarget->hasForced32BitAtomics()) {
1390 // Cortex-M (besides Cortex-M0) have 32-bit atomics.
1392 } else {
1393 // We can't assume anything about other targets; just use libatomic
1394 // routines.
1396 }
1397
1399
1400 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
1401
1402 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1403 if (!Subtarget->hasV6Ops()) {
1406 }
1408
1409 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1410 !Subtarget->isThumb1Only()) {
1411 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1412 // iff target supports vfp2.
1413 setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1415 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
1416 setOperationAction(ISD::GET_FPENV, MVT::i32, Legal);
1417 setOperationAction(ISD::SET_FPENV, MVT::i32, Legal);
1418 setOperationAction(ISD::RESET_FPENV, MVT::Other, Legal);
1419 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
1420 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
1421 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
1422 }
1423
1424 // We want to custom lower some of our intrinsics.
1429 if (Subtarget->useSjLjEH())
1430 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1431
1441 if (Subtarget->hasFullFP16()) {
1445 }
1446
1448
1449 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
1450 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
1451 if (Subtarget->hasFullFP16())
1452 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
1453 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
1454 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
1455 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1456
1457 // We don't support sin/cos/fmod/copysign/pow
1458 setOperationAction(ISD::FSIN, MVT::f64, Expand);
1459 setOperationAction(ISD::FSIN, MVT::f32, Expand);
1460 setOperationAction(ISD::FCOS, MVT::f32, Expand);
1461 setOperationAction(ISD::FCOS, MVT::f64, Expand);
1462 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1463 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1466 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1467 !Subtarget->isThumb1Only()) {
1470 }
1471 setOperationAction(ISD::FPOW, MVT::f64, Expand);
1472 setOperationAction(ISD::FPOW, MVT::f32, Expand);
1473
1474 if (!Subtarget->hasVFP4Base()) {
1477 }
1478
1479 // Various VFP goodness
1480 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1481 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1482 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1483 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1484 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1485 }
1486
1487 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1488 if (!Subtarget->hasFP16()) {
1489 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1490 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1491 }
1492
1493 // Strict floating-point comparisons need custom lowering.
1500 }
1501
1502 // Use __sincos_stret if available.
1503 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1504 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1505 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1506 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1507 }
1508
1509 // FP-ARMv8 implements a lot of rounding-like FP operations.
1510 if (Subtarget->hasFPARMv8Base()) {
1511 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1512 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1513 setOperationAction(ISD::FROUND, MVT::f32, Legal);
1514 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1515 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1516 setOperationAction(ISD::FRINT, MVT::f32, Legal);
1517 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1518 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1519 if (Subtarget->hasNEON()) {
1520 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1521 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1522 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1523 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1524 }
1525
1526 if (Subtarget->hasFP64()) {
1527 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1528 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1529 setOperationAction(ISD::FROUND, MVT::f64, Legal);
1530 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1531 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1532 setOperationAction(ISD::FRINT, MVT::f64, Legal);
1533 setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1534 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1535 }
1536 }
1537
1538 // FP16 often need to be promoted to call lib functions
1539 if (Subtarget->hasFullFP16()) {
1542 setOperationAction(ISD::FSIN, MVT::f16, Promote);
1543 setOperationAction(ISD::FCOS, MVT::f16, Promote);
1544 setOperationAction(ISD::FTAN, MVT::f16, Promote);
1545 setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
1546 setOperationAction(ISD::FPOWI, MVT::f16, Promote);
1547 setOperationAction(ISD::FPOW, MVT::f16, Promote);
1548 setOperationAction(ISD::FEXP, MVT::f16, Promote);
1549 setOperationAction(ISD::FEXP2, MVT::f16, Promote);
1550 setOperationAction(ISD::FEXP10, MVT::f16, Promote);
1551 setOperationAction(ISD::FLOG, MVT::f16, Promote);
1552 setOperationAction(ISD::FLOG10, MVT::f16, Promote);
1553 setOperationAction(ISD::FLOG2, MVT::f16, Promote);
1554
1555 setOperationAction(ISD::FROUND, MVT::f16, Legal);
1556 }
1557
1558 if (Subtarget->hasNEON()) {
1559 // vmin and vmax aren't available in a scalar form, so we can use
1560 // a NEON instruction with an undef lane instead.
1561 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
1562 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
1563 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
1564 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
1565 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
1566 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
1567 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
1568 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
1569
1570 if (Subtarget->hasFullFP16()) {
1571 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
1572 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
1573 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
1574 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
1575
1576 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
1577 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
1578 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
1579 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
1580 }
1581 }
1582
1583 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1584 // it, but it's just a wrapper around ldexp.
1585 if (Subtarget->isTargetWindows()) {
1586 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1587 if (isOperationExpand(Op, MVT::f32))
1588 setOperationAction(Op, MVT::f32, Promote);
1589 }
1590
1591 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1592 // isn't legal.
1593 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1594 if (isOperationExpand(Op, MVT::f16))
1595 setOperationAction(Op, MVT::f16, Promote);
1596
1597 // We have target-specific dag combine patterns for the following nodes:
1598 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1601
1602 if (Subtarget->hasMVEIntegerOps())
1604
1605 if (Subtarget->hasV6Ops())
1607 if (Subtarget->isThumb1Only())
1609 // Attempt to lower smin/smax to ssat/usat
1610 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1611 Subtarget->isThumb2()) {
1613 }
1614
1616
1617 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1618 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1620 else
1622
1623 //// temporary - rewrite interface to use type
1626 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1628 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1630
1631 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1632 // are at least 4 bytes aligned.
1634
1635 // Prefer likely predicted branches to selects on out-of-order cores.
1636 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1637
1640 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1641
1642 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1643}
1644
1646 return Subtarget->useSoftFloat();
1647}
1648
1649// FIXME: It might make sense to define the representative register class as the
1650// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1651// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1652// SPR's representative would be DPR_VFP2. This should work well if register
1653// pressure tracking were modified such that a register use would increment the
1654// pressure of the register class's representative and all of its super
1655// classes' representatives transitively. We have not implemented this because
1656// of the difficulty prior to coalescing of modeling operand register classes
1657// due to the common occurrence of cross class copies and subregister insertions
1658// and extractions.
1659std::pair<const TargetRegisterClass *, uint8_t>
1661 MVT VT) const {
1662 const TargetRegisterClass *RRC = nullptr;
1663 uint8_t Cost = 1;
1664 switch (VT.SimpleTy) {
1665 default:
1667 // Use DPR as representative register class for all floating point
1668 // and vector types. Since there are 32 SPR registers and 32 DPR registers,
1669 // the cost is 1 for both f32 and f64.
1670 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1671 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1672 RRC = &ARM::DPRRegClass;
1673 // When NEON is used for SP, only half of the register file is available
1674 // because operations that define both SP and DP results will be constrained
1675 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1676 // coalescing by double-counting the SP regs. See the FIXME above.
1677 if (Subtarget->useNEONForSinglePrecisionFP())
1678 Cost = 2;
1679 break;
1680 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1681 case MVT::v4f32: case MVT::v2f64:
1682 RRC = &ARM::DPRRegClass;
1683 Cost = 2;
1684 break;
1685 case MVT::v4i64:
1686 RRC = &ARM::DPRRegClass;
1687 Cost = 4;
1688 break;
1689 case MVT::v8i64:
1690 RRC = &ARM::DPRRegClass;
1691 Cost = 8;
1692 break;
1693 }
1694 return std::make_pair(RRC, Cost);
1695}
1696
1697const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1698#define MAKE_CASE(V) \
1699 case V: \
1700 return #V;
1701 switch ((ARMISD::NodeType)Opcode) {
1703 break;
1906#undef MAKE_CASE
1907 }
1908 return nullptr;
1909}
1910
1912 EVT VT) const {
1913 if (!VT.isVector())
1914 return getPointerTy(DL);
1915
1916 // MVE has a predicate register.
1917 if ((Subtarget->hasMVEIntegerOps() &&
1918 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1919 VT == MVT::v16i8)) ||
1920 (Subtarget->hasMVEFloatOps() &&
1921 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1922 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1924}
1925
1926/// getRegClassFor - Return the register class that should be used for the
1927/// specified value type.
1928const TargetRegisterClass *
1929ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1930 (void)isDivergent;
1931 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1932 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1933 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1934 // MVE Q registers.
1935 if (Subtarget->hasNEON()) {
1936 if (VT == MVT::v4i64)
1937 return &ARM::QQPRRegClass;
1938 if (VT == MVT::v8i64)
1939 return &ARM::QQQQPRRegClass;
1940 }
1941 if (Subtarget->hasMVEIntegerOps()) {
1942 if (VT == MVT::v4i64)
1943 return &ARM::MQQPRRegClass;
1944 if (VT == MVT::v8i64)
1945 return &ARM::MQQQQPRRegClass;
1946 }
1948}
1949
1950// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1951// source/dest is aligned and the copy size is large enough. We therefore want
1952// to align such objects passed to memory intrinsics.
1954 Align &PrefAlign) const {
1955 if (!isa<MemIntrinsic>(CI))
1956 return false;
1957 MinSize = 8;
1958 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1959 // cycle faster than 4-byte aligned LDM.
1960 PrefAlign =
1961 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1962 return true;
1963}
1964
1965// Create a fast isel object.
1966FastISel *
1968 const TargetLibraryInfo *libInfo) const {
1969 return ARM::createFastISel(funcInfo, libInfo);
1970}
1971
1973 unsigned NumVals = N->getNumValues();
1974 if (!NumVals)
1975 return Sched::RegPressure;
1976
1977 for (unsigned i = 0; i != NumVals; ++i) {
1978 EVT VT = N->getValueType(i);
1979 if (VT == MVT::Glue || VT == MVT::Other)
1980 continue;
1981 if (VT.isFloatingPoint() || VT.isVector())
1982 return Sched::ILP;
1983 }
1984
1985 if (!N->isMachineOpcode())
1986 return Sched::RegPressure;
1987
1988 // Loads are scheduled for latency even if their instruction itinerary
1989 // is not available.
1990 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1991 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1992
1993 if (MCID.getNumDefs() == 0)
1994 return Sched::RegPressure;
1995 if (!Itins->isEmpty() &&
1996 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1997 return Sched::ILP;
1998
1999 return Sched::RegPressure;
2000}
2001
2002//===----------------------------------------------------------------------===//
2003// Lowering Code
2004//===----------------------------------------------------------------------===//
2005
2006static bool isSRL16(const SDValue &Op) {
2007 if (Op.getOpcode() != ISD::SRL)
2008 return false;
2009 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2010 return Const->getZExtValue() == 16;
2011 return false;
2012}
2013
2014static bool isSRA16(const SDValue &Op) {
2015 if (Op.getOpcode() != ISD::SRA)
2016 return false;
2017 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2018 return Const->getZExtValue() == 16;
2019 return false;
2020}
2021
2022static bool isSHL16(const SDValue &Op) {
2023 if (Op.getOpcode() != ISD::SHL)
2024 return false;
2025 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2026 return Const->getZExtValue() == 16;
2027 return false;
2028}
2029
2030// Check for a signed 16-bit value. We special case SRA because it makes it
2031// more simple when also looking for SRAs that aren't sign extending a
2032// smaller value. Without the check, we'd need to take extra care with
2033// checking order for some operations.
2034static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
2035 if (isSRA16(Op))
2036 return isSHL16(Op.getOperand(0));
2037 return DAG.ComputeNumSignBits(Op) == 17;
2038}
2039
2040/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
2042 switch (CC) {
2043 default: llvm_unreachable("Unknown condition code!");
2044 case ISD::SETNE: return ARMCC::NE;
2045 case ISD::SETEQ: return ARMCC::EQ;
2046 case ISD::SETGT: return ARMCC::GT;
2047 case ISD::SETGE: return ARMCC::GE;
2048 case ISD::SETLT: return ARMCC::LT;
2049 case ISD::SETLE: return ARMCC::LE;
2050 case ISD::SETUGT: return ARMCC::HI;
2051 case ISD::SETUGE: return ARMCC::HS;
2052 case ISD::SETULT: return ARMCC::LO;
2053 case ISD::SETULE: return ARMCC::LS;
2054 }
2055}
2056
2057/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
2059 ARMCC::CondCodes &CondCode2) {
2060 CondCode2 = ARMCC::AL;
2061 switch (CC) {
2062 default: llvm_unreachable("Unknown FP condition!");
2063 case ISD::SETEQ:
2064 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
2065 case ISD::SETGT:
2066 case ISD::SETOGT: CondCode = ARMCC::GT; break;
2067 case ISD::SETGE:
2068 case ISD::SETOGE: CondCode = ARMCC::GE; break;
2069 case ISD::SETOLT: CondCode = ARMCC::MI; break;
2070 case ISD::SETOLE: CondCode = ARMCC::LS; break;
2071 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
2072 case ISD::SETO: CondCode = ARMCC::VC; break;
2073 case ISD::SETUO: CondCode = ARMCC::VS; break;
2074 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
2075 case ISD::SETUGT: CondCode = ARMCC::HI; break;
2076 case ISD::SETUGE: CondCode = ARMCC::PL; break;
2077 case ISD::SETLT:
2078 case ISD::SETULT: CondCode = ARMCC::LT; break;
2079 case ISD::SETLE:
2080 case ISD::SETULE: CondCode = ARMCC::LE; break;
2081 case ISD::SETNE:
2082 case ISD::SETUNE: CondCode = ARMCC::NE; break;
2083 }
2084}
2085
2086//===----------------------------------------------------------------------===//
2087// Calling Convention Implementation
2088//===----------------------------------------------------------------------===//
2089
2090/// getEffectiveCallingConv - Get the effective calling convention, taking into
2091/// account presence of floating point hardware and calling convention
2092/// limitations, such as support for variadic functions.
2093CallingConv::ID
2094ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
2095 bool isVarArg) const {
2096 switch (CC) {
2097 default:
2098 report_fatal_error("Unsupported calling convention");
2101 case CallingConv::GHC:
2103 return CC;
2109 case CallingConv::Swift:
2112 case CallingConv::C:
2113 case CallingConv::Tail:
2114 if (!Subtarget->isAAPCS_ABI())
2115 return CallingConv::ARM_APCS;
2116 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
2117 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
2118 !isVarArg)
2120 else
2122 case CallingConv::Fast:
2124 if (!Subtarget->isAAPCS_ABI()) {
2125 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
2126 return CallingConv::Fast;
2127 return CallingConv::ARM_APCS;
2128 } else if (Subtarget->hasVFP2Base() &&
2129 !Subtarget->isThumb1Only() && !isVarArg)
2131 else
2133 }
2134}
2135
2137 bool isVarArg) const {
2138 return CCAssignFnForNode(CC, false, isVarArg);
2139}
2140
2142 bool isVarArg) const {
2143 return CCAssignFnForNode(CC, true, isVarArg);
2144}
2145
2146/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
2147/// CallingConvention.
2148CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
2149 bool Return,
2150 bool isVarArg) const {
2151 switch (getEffectiveCallingConv(CC, isVarArg)) {
2152 default:
2153 report_fatal_error("Unsupported calling convention");
2155 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
2157 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2159 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
2160 case CallingConv::Fast:
2161 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
2162 case CallingConv::GHC:
2163 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2165 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2167 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2169 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2170 }
2171}
2172
2173SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2174 MVT LocVT, MVT ValVT, SDValue Val) const {
2175 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2176 Val);
2177 if (Subtarget->hasFullFP16()) {
2178 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2179 } else {
2180 Val = DAG.getNode(ISD::TRUNCATE, dl,
2181 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2182 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2183 }
2184 return Val;
2185}
2186
2187SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2188 MVT LocVT, MVT ValVT,
2189 SDValue Val) const {
2190 if (Subtarget->hasFullFP16()) {
2191 Val = DAG.getNode(ARMISD::VMOVrh, dl,
2192 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2193 } else {
2194 Val = DAG.getNode(ISD::BITCAST, dl,
2195 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2196 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2197 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2198 }
2199 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2200}
2201
2202/// LowerCallResult - Lower the result values of a call into the
2203/// appropriate copies out of appropriate physical registers.
2204SDValue ARMTargetLowering::LowerCallResult(
2205 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
2206 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2207 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2208 SDValue ThisVal, bool isCmseNSCall) const {
2209 // Assign locations to each value returned by this call.
2211 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2212 *DAG.getContext());
2213 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2214
2215 // Copy all of the result registers out of their specified physreg.
2216 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2217 CCValAssign VA = RVLocs[i];
2218
2219 // Pass 'this' value directly from the argument to return value, to avoid
2220 // reg unit interference
2221 if (i == 0 && isThisReturn) {
2222 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2223 "unexpected return calling convention register assignment");
2224 InVals.push_back(ThisVal);
2225 continue;
2226 }
2227
2228 SDValue Val;
2229 if (VA.needsCustom() &&
2230 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2231 // Handle f64 or half of a v2f64.
2232 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2233 InGlue);
2234 Chain = Lo.getValue(1);
2235 InGlue = Lo.getValue(2);
2236 VA = RVLocs[++i]; // skip ahead to next loc
2237 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2238 InGlue);
2239 Chain = Hi.getValue(1);
2240 InGlue = Hi.getValue(2);
2241 if (!Subtarget->isLittle())
2242 std::swap (Lo, Hi);
2243 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2244
2245 if (VA.getLocVT() == MVT::v2f64) {
2246 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2247 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2248 DAG.getConstant(0, dl, MVT::i32));
2249
2250 VA = RVLocs[++i]; // skip ahead to next loc
2251 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2252 Chain = Lo.getValue(1);
2253 InGlue = Lo.getValue(2);
2254 VA = RVLocs[++i]; // skip ahead to next loc
2255 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2256 Chain = Hi.getValue(1);
2257 InGlue = Hi.getValue(2);
2258 if (!Subtarget->isLittle())
2259 std::swap (Lo, Hi);
2260 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2261 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2262 DAG.getConstant(1, dl, MVT::i32));
2263 }
2264 } else {
2265 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
2266 InGlue);
2267 Chain = Val.getValue(1);
2268 InGlue = Val.getValue(2);
2269 }
2270
2271 switch (VA.getLocInfo()) {
2272 default: llvm_unreachable("Unknown loc info!");
2273 case CCValAssign::Full: break;
2274 case CCValAssign::BCvt:
2275 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2276 break;
2277 }
2278
2279 // f16 arguments have their size extended to 4 bytes and passed as if they
2280 // had been copied to the LSBs of a 32-bit register.
2281 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2282 if (VA.needsCustom() &&
2283 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
2284 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2285
2286 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
2287 // is less than 32 bits must be sign- or zero-extended after the call for
2288 // security reasons. Although the ABI mandates an extension done by the
2289 // callee, the latter cannot be trusted to follow the rules of the ABI.
2290 const ISD::InputArg &Arg = Ins[VA.getValNo()];
2291 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
2292 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
2293 Val = handleCMSEValue(Val, Arg, DAG, dl);
2294
2295 InVals.push_back(Val);
2296 }
2297
2298 return Chain;
2299}
2300
2301std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
2302 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
2303 bool IsTailCall, int SPDiff) const {
2304 SDValue DstAddr;
2305 MachinePointerInfo DstInfo;
2306 int32_t Offset = VA.getLocMemOffset();
2308
2309 if (IsTailCall) {
2310 Offset += SPDiff;
2311 auto PtrVT = getPointerTy(DAG.getDataLayout());
2312 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
2313 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
2314 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2315 DstInfo =
2317 } else {
2318 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
2319 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2320 StackPtr, PtrOff);
2321 DstInfo =
2323 }
2324
2325 return std::make_pair(DstAddr, DstInfo);
2326}
2327
2328void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2329 SDValue Chain, SDValue &Arg,
2330 RegsToPassVector &RegsToPass,
2331 CCValAssign &VA, CCValAssign &NextVA,
2332 SDValue &StackPtr,
2333 SmallVectorImpl<SDValue> &MemOpChains,
2334 bool IsTailCall,
2335 int SPDiff) const {
2336 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2337 DAG.getVTList(MVT::i32, MVT::i32), Arg);
2338 unsigned id = Subtarget->isLittle() ? 0 : 1;
2339 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2340
2341 if (NextVA.isRegLoc())
2342 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2343 else {
2344 assert(NextVA.isMemLoc());
2345 if (!StackPtr.getNode())
2346 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2348
2349 SDValue DstAddr;
2350 MachinePointerInfo DstInfo;
2351 std::tie(DstAddr, DstInfo) =
2352 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2353 MemOpChains.push_back(
2354 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2355 }
2356}
2357
2358static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2359 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2361}
2362
2363/// LowerCall - Lowering a call into a callseq_start <-
2364/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
2365/// nodes.
///
/// Reads the call description from \p CLI, decides whether the call can be a
/// tail call (writing the decision back through CLI.IsTailCall), places each
/// outgoing argument in its register or stack location, rewrites the callee
/// operand into a target node where needed, and emits either an
/// ARMISD::TC_RETURN node (tail call) or a call node followed by callseq_end
/// and LowerCallResult, which fills \p InVals.
///
/// NOTE(review): this listing omits several source lines of this function
/// (for example the declarations of Outs, Ins, MF, AFI, CSInfo, ArgLocs,
/// StackPtr and TM, which are all used below) — consult the full source
/// before editing.
2366SDValue
2367ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2368 SmallVectorImpl<SDValue> &InVals) const {
2369 SelectionDAG &DAG = CLI.DAG;
2370 SDLoc &dl = CLI.DL;
2372 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2374 SDValue Chain = CLI.Chain;
2375 SDValue Callee = CLI.Callee;
 // A reference: clearing it below reports "not a tail call" back via CLI.
2376 bool &isTailCall = CLI.IsTailCall;
2377 CallingConv::ID CallConv = CLI.CallConv;
2378 bool doesNotRet = CLI.DoesNotReturn;
2379 bool isVarArg = CLI.IsVarArg;
2380
2384 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2385 bool isThisReturn = false;
2386 bool isCmseNSCall = false;
2387 bool isSibCall = false;
2388 bool PreferIndirect = false;
2389 bool GuardWithBTI = false;
2390
2391 // Analyze operands of the call, assigning locations to each operand.
2393 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2394 *DAG.getContext());
2395 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2396
2397 // Lower 'returns_twice' calls to a pseudo-instruction.
2398 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2399 !Subtarget->noBTIAtReturnTwice())
2400 GuardWithBTI = AFI->branchTargetEnforcement();
2401
2402 // Determine whether this is a non-secure function call.
2403 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2404 isCmseNSCall = true;
2405
2406 // Disable tail calls if they're not supported.
2407 if (!Subtarget->supportsTailCall())
2408 isTailCall = false;
2409
2410 // For both the non-secure calls and the returns from a CMSE entry function,
2411 // the function needs to do some extra work after the call, or before the
2412 // return, respectively, thus it cannot end with a tail call
2413 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2414 isTailCall = false;
2415
2416 if (isa<GlobalAddressSDNode>(Callee)) {
2417 // If we're optimizing for minimum size and the function is called three or
2418 // more times in this block, we can improve codesize by calling indirectly
2419 // as BLXr has a 16-bit encoding.
2420 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2421 if (CLI.CB) {
2422 auto *BB = CLI.CB->getParent();
 // Counts instruction users of GV that live in the call's basic block.
2423 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2424 count_if(GV->users(), [&BB](const User *U) {
2425 return isa<Instruction>(U) &&
2426 cast<Instruction>(U)->getParent() == BB;
2427 }) > 2;
2428 }
2429 }
2430 if (isTailCall) {
2431 // Check if it's really possible to do a tail call.
2432 isTailCall =
2433 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2434
 // A tail call that is neither forced by GuaranteedTailCallOpt nor uses the
 // Tail/SwiftTail conventions is treated as a sibling call.
2435 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2436 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2437 isSibCall = true;
2438
2439 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2440 // detected sibcalls.
2441 if (isTailCall)
2442 ++NumTailCalls;
2443 }
2444
2445 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2446 report_fatal_error("failed to perform tail call elimination on a call "
2447 "site marked musttail");
2448
2449 // Get a count of how many bytes are to be pushed on the stack.
2450 unsigned NumBytes = CCInfo.getStackSize();
2451
2452 // SPDiff is the byte offset of the call's argument area from the callee's.
2453 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2454 // by this amount for a tail call. In a sibling call it must be 0 because the
2455 // caller will deallocate the entire stack and the callee still expects its
2456 // arguments to begin at SP+0. Completely unused for non-tail calls.
2457 int SPDiff = 0;
2458
2459 if (isTailCall && !isSibCall) {
2460 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2461 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2462
2463 // Since callee will pop argument stack as a tail call, we must keep the
2464 // popped size 16-byte aligned.
2465 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2466 assert(StackAlign && "data layout string is missing stack alignment");
2467 NumBytes = alignTo(NumBytes, *StackAlign);
2468
2469 // SPDiff will be negative if this tail call requires more space than we
2470 // would automatically have in our incoming argument space. Positive if we
2471 // can actually shrink the stack.
2472 SPDiff = NumReusableBytes - NumBytes;
2473
2474 // If this call requires more stack than we have available from
2475 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2476 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2477 AFI->setArgRegsSaveSize(-SPDiff);
2478 }
2479
2480 if (isSibCall) {
2481 // For sibling tail calls, memory operands are available in our caller's stack.
2482 NumBytes = 0;
2483 } else {
2484 // Adjust the stack pointer for the new arguments...
2485 // These operations are automatically eliminated by the prolog/epilog pass
2486 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2487 }
2488
 // NOTE(review): the line that binds this expression to a variable is not
 // visible in this listing; StackPtr, used below, is presumably its target.
2490 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2491
2492 RegsToPassVector RegsToPass;
2493 SmallVector<SDValue, 8> MemOpChains;
2494
2495 // During a tail call, stores to the argument area must happen after all of
2496 // the function's incoming arguments have been loaded because they may alias.
2497 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2498 // there's no point in doing so repeatedly so this tracks whether that's
2499 // happened yet.
2500 bool AfterFormalArgLoads = false;
2501
2502 // Walk the register/memloc assignments, inserting copies/loads. In the case
2503 // of tail call optimization, arguments are handled later.
 // i indexes ArgLocs and may advance more than once per iteration when one
 // argument occupies several locations; realArgIdx indexes Outs/OutVals and
 // advances exactly once per iteration.
2504 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2505 i != e;
2506 ++i, ++realArgIdx) {
2507 CCValAssign &VA = ArgLocs[i];
2508 SDValue Arg = OutVals[realArgIdx];
2509 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2510 bool isByVal = Flags.isByVal();
2511
2512 // Promote the value if needed.
2513 switch (VA.getLocInfo()) {
2514 default: llvm_unreachable("Unknown loc info!");
2515 case CCValAssign::Full: break;
2516 case CCValAssign::SExt:
2517 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2518 break;
2519 case CCValAssign::ZExt:
2520 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2521 break;
2522 case CCValAssign::AExt:
2523 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2524 break;
2525 case CCValAssign::BCvt:
2526 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2527 break;
2528 }
2529
2530 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2531 Chain = DAG.getStackArgumentTokenFactor(Chain);
2532 AfterFormalArgLoads = true;
2533 }
2534
2535 // f16 arguments have their size extended to 4 bytes and passed as if they
2536 // had been copied to the LSBs of a 32-bit register.
2537 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2538 if (VA.needsCustom() &&
2539 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2540 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2541 } else {
2542 // f16 arguments could have been extended prior to argument lowering.
2543 // Mask these arguments if this is a CMSE nonsecure call.
2544 auto ArgVT = Outs[realArgIdx].ArgVT;
2545 if (isCmseNSCall && (ArgVT == MVT::f16)) {
 // Keep only the low ArgVT-sized bits of the location value.
2546 auto LocBits = VA.getLocVT().getSizeInBits();
2547 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2548 SDValue Mask =
2549 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2550 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2551 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2552 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2553 }
2554 }
2555
2556 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2557 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2558 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2559 DAG.getConstant(0, dl, MVT::i32));
2560 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2561 DAG.getConstant(1, dl, MVT::i32));
2562
2563 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2564 StackPtr, MemOpChains, isTailCall, SPDiff);
2565
2566 VA = ArgLocs[++i]; // skip ahead to next loc
2567 if (VA.isRegLoc()) {
2568 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2569 StackPtr, MemOpChains, isTailCall, SPDiff);
2570 } else {
2571 assert(VA.isMemLoc());
2572 SDValue DstAddr;
2573 MachinePointerInfo DstInfo;
2574 std::tie(DstAddr, DstInfo) =
2575 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2576 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2577 }
2578 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2579 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2580 StackPtr, MemOpChains, isTailCall, SPDiff);
2581 } else if (VA.isRegLoc()) {
 // A first i32 argument marked 'returned' (and not swiftself) enables the
 // 'this'-return handling further down.
2582 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2583 Outs[0].VT == MVT::i32) {
2584 assert(VA.getLocVT() == MVT::i32 &&
2585 "unexpected calling convention register assignment");
2586 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2587 "unexpected use of 'returned'");
2588 isThisReturn = true;
2589 }
2590 const TargetOptions &Options = DAG.getTarget().Options;
2591 if (Options.EmitCallSiteInfo)
2592 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2593 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2594 } else if (isByVal) {
2595 assert(VA.isMemLoc());
 // Number of 4-byte words of the aggregate that are passed in registers.
2596 unsigned offset = 0;
2597
2598 // True if this byval aggregate will be split between registers
2599 // and memory.
2600 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2601 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2602
2603 if (CurByValIdx < ByValArgsCount) {
2604
2605 unsigned RegBegin, RegEnd;
2606 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2607
 // NOTE(review): the initializer of PtrVT is on a line missing from this
 // listing.
2608 EVT PtrVT =
 // These i/j are local to this block and shadow the outer loop's i.
2610 unsigned int i, j;
2611 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2612 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2613 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2614 SDValue Load =
2615 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2616 DAG.InferPtrAlign(AddArg));
2617 MemOpChains.push_back(Load.getValue(1));
2618 RegsToPass.push_back(std::make_pair(j, Load));
2619 }
2620
2621 // If the parameter size exceeds the register area, the "offset" value
2622 // helps us to calculate the stack slot for the remaining part properly.
2623 offset = RegEnd - RegBegin;
2624
2625 CCInfo.nextInRegsParam();
2626 }
2627
 // Copy whatever did not fit in registers to the stack.
2628 if (Flags.getByValSize() > 4*offset) {
2629 auto PtrVT = getPointerTy(DAG.getDataLayout());
2630 SDValue Dst;
2631 MachinePointerInfo DstInfo;
2632 std::tie(Dst, DstInfo) =
2633 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2634 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2635 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
2636 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2637 MVT::i32);
2638 SDValue AlignNode =
2639 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2640
2641 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2642 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2643 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2644 Ops));
2645 }
2646 } else {
2647 assert(VA.isMemLoc());
2648 SDValue DstAddr;
2649 MachinePointerInfo DstInfo;
2650 std::tie(DstAddr, DstInfo) =
2651 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2652
2653 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2654 MemOpChains.push_back(Store);
2655 }
2656 }
2657
 // Join all argument loads/stores so the register copies below follow them.
2658 if (!MemOpChains.empty())
2659 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2660
2661 // Build a sequence of copy-to-reg nodes chained together with token chain
2662 // and flag operands which copy the outgoing args into the appropriate regs.
2663 SDValue InGlue;
2664 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2665 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2666 RegsToPass[i].second, InGlue);
2667 InGlue = Chain.getValue(1);
2668 }
2669
2670 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2671 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2672 // node so that legalize doesn't hack it.
2673 bool isDirect = false;
2674
2676 const GlobalValue *GVal = nullptr;
 // NOTE(review): the condition guarding this assignment (and the definition
 // of G) is on a line missing from this listing; GVal stays null unless it
 // runs.
2678 GVal = G->getGlobal();
2679 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();
2680
2681 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2682 bool isLocalARMFunc = false;
2683 auto PtrVt = getPointerTy(DAG.getDataLayout());
2684
2685 if (Subtarget->genLongCalls()) {
2686 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2687 "long-calls codegen is not position independent!");
2688 // Handle a global address or an external symbol. If it's not one of
2689 // those, the target's already in a register, so we don't need to do
2690 // anything extra.
2691 if (isa<GlobalAddressSDNode>(Callee)) {
2692 if (Subtarget->genExecuteOnly()) {
2693 if (Subtarget->useMovt())
2694 ++NumMovwMovt;
2695 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2696 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2697 } else {
2698 // Create a constant pool entry for the callee address
2699 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2701 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2702
2703 // Get the address of the callee into a register
2704 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2705 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2706 Callee = DAG.getLoad(
2707 PtrVt, dl, DAG.getEntryNode(), Addr,
2709 }
 // NOTE(review): the branch header introducing S (presumably the external
 // symbol case) is on a line missing from this listing.
2711 const char *Sym = S->getSymbol();
2712
2713 if (Subtarget->genExecuteOnly()) {
2714 if (Subtarget->useMovt())
2715 ++NumMovwMovt;
 // NOTE(review): this path uses GVal rather than Sym; if this is the
 // external-symbol branch GVal may still be null here — confirm.
2716 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2717 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2718 } else {
2719 // Create a constant pool entry for the callee address
2720 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2722 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2723
2724 // Get the address of the callee into a register
2725 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2726 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2727 Callee = DAG.getLoad(
2728 PtrVt, dl, DAG.getEntryNode(), Addr,
2730 }
2731 }
2732 } else if (isa<GlobalAddressSDNode>(Callee)) {
 // When PreferIndirect is set the callee is left untouched and isDirect
 // stays false.
2733 if (!PreferIndirect) {
2734 isDirect = true;
2735 bool isDef = GVal->isStrongDefinitionForLinker();
2736
2737 // ARM call to a local ARM function is predicable.
2738 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2739 // tBX takes a register source operand.
2740 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2741 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2742 Callee = DAG.getNode(
2743 ARMISD::WrapperPIC, dl, PtrVt,
2744 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2745 Callee = DAG.getLoad(
2746 PtrVt, dl, DAG.getEntryNode(), Callee,
2750 } else if (Subtarget->isTargetCOFF()) {
2751 assert(Subtarget->isTargetWindows() &&
2752 "Windows is the only supported COFF target");
2753 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2754 if (GVal->hasDLLImportStorageClass())
2755 TargetFlags = ARMII::MO_DLLIMPORT;
2756 else if (!TM.shouldAssumeDSOLocal(GVal))
2757 TargetFlags = ARMII::MO_COFFSTUB;
2758 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2759 TargetFlags);
 // dllimport and COFF-stub callees are reached through a loaded pointer.
2760 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2761 Callee =
2762 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2763 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2765 } else {
2766 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2767 }
2768 }
2769 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2770 isDirect = true;
2771 // tBX takes a register source operand.
2772 const char *Sym = S->getSymbol();
2773 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2774 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2777 ARMPCLabelIndex, 4);
2778 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2779 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2780 Callee = DAG.getLoad(
2781 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2783 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2784 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2785 } else {
2786 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2787 }
2788 }
2789
 // Non-secure calls may neither pass arguments on the stack nor return
 // through an sret pointer; both cases are reported as diagnostics.
2790 if (isCmseNSCall) {
2791 assert(!isARMFunc && !isDirect &&
2792 "Cannot handle call to ARM function or direct call");
2793 if (NumBytes > 0) {
2795 "call to non-secure function would "
2796 "require passing arguments on stack",
2797 dl.getDebugLoc());
2798 DAG.getContext()->diagnose(Diag);
2799 }
2800 if (isStructRet) {
2803 "call to non-secure function would return value through pointer",
2804 dl.getDebugLoc());
2805 DAG.getContext()->diagnose(Diag);
2806 }
2807 }
2808
2809 // FIXME: handle tail calls differently.
 // Select the call opcode; it is only used on the non-tail-call path below.
2810 unsigned CallOpc;
2811 if (Subtarget->isThumb()) {
2812 if (GuardWithBTI)
2813 CallOpc = ARMISD::t2CALL_BTI;
2814 else if (isCmseNSCall)
2815 CallOpc = ARMISD::tSECALL;
2816 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2817 CallOpc = ARMISD::CALL_NOLINK;
2818 else
2819 CallOpc = ARMISD::CALL;
2820 } else {
2821 if (!isDirect && !Subtarget->hasV5TOps())
2822 CallOpc = ARMISD::CALL_NOLINK;
2823 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2824 // Emit regular call when code size is the priority
2825 !Subtarget->hasMinSize())
2826 // "mov lr, pc; b _foo" to avoid confusing the RSP
2827 CallOpc = ARMISD::CALL_NOLINK;
2828 else
2829 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2830 }
2831
2832 // We don't usually want to end the call-sequence here because we would tidy
2833 // the frame up *after* the call, however in the ABI-changing tail-call case
2834 // we've carefully laid out the parameters so that when sp is reset they'll be
2835 // in the correct location.
2836 if (isTailCall && !isSibCall) {
2837 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2838 InGlue = Chain.getValue(1);
2839 }
2840
 // Operand order: chain, callee, [SPDiff for tail calls], argument
 // registers, register mask, [glue].
2841 std::vector<SDValue> Ops;
2842 Ops.push_back(Chain);
2843 Ops.push_back(Callee);
2844
2845 if (isTailCall) {
2846 Ops.push_back(
2847 DAG.getSignedConstant(SPDiff, dl, MVT::i32, /*isTarget=*/true));
2848 }
2849
2850 // Add argument registers to the end of the list so that they are known live
2851 // into the call.
2852 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2853 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2854 RegsToPass[i].second.getValueType()));
2855
2856 // Add a register mask operand representing the call-preserved registers.
2857 const uint32_t *Mask;
2858 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2859 if (isThisReturn) {
2860 // For 'this' returns, use the R0-preserving mask if applicable
2861 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2862 if (!Mask) {
2863 // Set isThisReturn to false if the calling convention is not one that
2864 // allows 'returned' to be modeled in this way, so LowerCallResult does
2865 // not try to pass 'this' straight through
2866 isThisReturn = false;
2867 Mask = ARI->getCallPreservedMask(MF, CallConv);
2868 }
2869 } else
2870 Mask = ARI->getCallPreservedMask(MF, CallConv);
2871
2872 assert(Mask && "Missing call preserved mask for calling convention");
2873 Ops.push_back(DAG.getRegisterMask(Mask));
2874
2875 if (InGlue.getNode())
2876 Ops.push_back(InGlue);
2877
2878 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 // Tail calls end here: no callseq_end and no result copies are emitted.
2879 if (isTailCall) {
2881 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2882 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2883 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2884 return Ret;
2885 }
2886
2887 // Returns a chain and a flag for retval copy to use.
2888 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2889 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2890 InGlue = Chain.getValue(1);
2891 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2892
2893 // If we're guaranteeing tail-calls will be honoured, the callee must
2894 // pop its own argument stack on return. But this call is *not* a tail call so
2895 // we need to undo that after it returns to restore the status-quo.
2896 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
 // -1U is passed when the callee pops nothing.
2897 uint64_t CalleePopBytes =
2898 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
2899
2900 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2901 if (!Ins.empty())
2902 InGlue = Chain.getValue(1);
2903
2904 // Handle result values, copying them out of physregs into vregs that we
2905 // return.
2906 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2907 InVals, isThisReturn,
2908 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2909}
2910
2911/// HandleByVal - Every parameter *after* a byval parameter is passed
2912/// on the stack. Remember the next parameter register to allocate,
2913/// and then confiscate the rest of the parameter registers to insure
2914/// this.
2915void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2916 Align Alignment) const {
2917 // Byval (as with any stack) slots are always at least 4 byte aligned.
2918 Alignment = std::max(Alignment, Align(4));
2919
2920 MCRegister Reg = State->AllocateReg(GPRArgRegs);
2921 if (!Reg)
2922 return;
2923
2924 unsigned AlignInRegs = Alignment.value() / 4;
2925 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2926 for (unsigned i = 0; i < Waste; ++i)
2927 Reg = State->AllocateReg(GPRArgRegs);
2928
2929 if (!Reg)
2930 return;
2931
2932 unsigned Excess = 4 * (ARM::R4 - Reg);
2933
2934 // Special case when NSAA != SP and parameter size greater than size of
2935 // all remained GPR regs. In that case we can't split parameter, we must
2936 // send it to stack. We also must set NCRN to R4, so waste all
2937 // remained registers.
2938 const unsigned NSAAOffset = State->getStackSize();
2939 if (NSAAOffset != 0 && Size > Excess) {
2940 while (State->AllocateReg(GPRArgRegs))
2941 ;
2942 return;
2943 }
2944
2945 // First register for byval parameter is the first register that wasn't
2946 // allocated before this method call, so it would be "reg".
2947 // If parameter is small enough to be saved in range [reg, r4), then
2948 // the end (first after last) register would be reg + param-size-in-regs,
2949 // else parameter would be splitted between registers and stack,
2950 // end register would be r4 in this case.
2951 unsigned ByValRegBegin = Reg;
2952 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2953 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2954 // Note, first register is allocated in the beginning of function already,
2955 // allocate remained amount of registers we need.
2956 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2957 State->AllocateReg(GPRArgRegs);
2958 // A byval parameter that is split between registers and memory needs its
2959 // size truncated here.
2960 // In the case where the entire structure fits in registers, we set the
2961 // size in memory to zero.
2962 Size = std::max<int>(Size - Excess, 0);
2963}
2964
2965/// MatchingStackOffset - Return true if the given stack call argument is
2966/// already available in the same position (relatively) of the caller's
2967/// incoming argument stack.
///
/// The argument qualifies only when its value can be traced to a fixed frame
/// object whose offset equals Offset and whose size equals the argument's
/// size in bytes. Byval arguments never qualify.
///
/// NOTE(review): the lines carrying the function name and its leading
/// parameters are missing from this listing; the body uses Arg, Offset,
/// Flags, MFI, MRI and TII — confirm the signature against the full source.
2968static
2971 const TargetInstrInfo *TII) {
2972 unsigned Bytes = Arg.getValueSizeInBits() / 8;
 // Sentinel meaning "no frame index found yet"; checked by the assert below.
2973 int FI = std::numeric_limits<int>::max();
2974 if (Arg.getOpcode() == ISD::CopyFromReg) {
 // The value comes from a register: accept only a virtual register whose
 // defining instruction is a load from a stack slot.
2975 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2976 if (!VR.isVirtual())
2977 return false;
2978 MachineInstr *Def = MRI->getVRegDef(VR);
2979 if (!Def)
2980 return false;
2981 if (!Flags.isByVal()) {
 // isLoadFromStackSlot sets FI on success.
2982 if (!TII->isLoadFromStackSlot(*Def, FI))
2983 return false;
2984 } else {
2985 return false;
2986 }
2987 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2988 if (Flags.isByVal())
2989 // ByVal argument is passed in as a pointer but it's now being
2990 // dereferenced. e.g.
2991 // define @foo(%struct.X* %A) {
2992 // tail call @bar(%struct.X* byval %A)
2993 // }
2994 return false;
2995 SDValue Ptr = Ld->getBasePtr();
 // NOTE(review): the definition of FINode is on a line missing from this
 // listing; presumably it is derived from Ptr — confirm.
2997 if (!FINode)
2998 return false;
2999 FI = FINode->getIndex();
3000 } else
3001 return false;
3002
3003 assert(FI != std::numeric_limits<int>::max());
3004 if (!MFI.isFixedObjectIndex(FI))
3005 return false;
3006 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
3007}
3008
3009/// IsEligibleForTailCallOptimization - Check whether the call is eligible
3010/// for tail call optimization. Targets which want to do tail call
3011/// optimization should implement this function. Note that this function also
3012/// processes musttail calls, so when this function returns false on a valid
3013/// musttail call, a fatal backend error occurs.
/// The checks below run in order and the first failing one returns false.
/// Apart from the canGuaranteeTCO shortcut, which returns the result of the
/// calling-convention comparison directly, true is returned only when every
/// check passes.
3014bool ARMTargetLowering::IsEligibleForTailCallOptimization(
3016 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
3017 CallingConv::ID CalleeCC = CLI.CallConv;
3018 SDValue Callee = CLI.Callee;
3019 bool isVarArg = CLI.IsVarArg;
3020 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3021 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3023 const SelectionDAG &DAG = CLI.DAG;
3025 const Function &CallerF = MF.getFunction();
3026 CallingConv::ID CallerCC = CallerF.getCallingConv();
3027
3028 assert(Subtarget->supportsTailCall());
3029
3030 // Indirect tail-calls require a register to hold the target address. That
3031 // register must be:
3032 // * Allocatable (i.e. r0-r7 if the target is Thumb1).
3033 // * Not callee-saved, so must be one of r0-r3 or r12.
3034 // * Not used to hold an argument to the tail-called function, which might be
3035 // in r0-r3.
3036 // * Not used to hold the return address authentication code, which is in r12
3037 // if enabled.
3038 // Sometimes, no register matches all of these conditions, so we can't do a
3039 // tail-call.
3040 if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
 // Start from the candidate set {r0-r3, optionally r12} and remove every
 // register that already carries an outgoing argument.
3041 SmallSet<MCPhysReg, 5> AddressRegisters;
3042 for (Register R : {ARM::R0, ARM::R1, ARM::R2, ARM::R3})
3043 AddressRegisters.insert(R);
3044 if (!(Subtarget->isThumb1Only() ||
3046 AddressRegisters.insert(ARM::R12);
3047 for (const CCValAssign &AL : ArgLocs)
3048 if (AL.isRegLoc())
3049 AddressRegisters.erase(AL.getLocReg());
3050 if (AddressRegisters.empty())
3051 return false;
3052 }
3053
3054 // Look for obvious safe cases to perform tail call optimization that do not
3055 // require ABI changes. This is what gcc calls sibcall.
3056
3057 // Exception-handling functions need a special set of instructions to indicate
3058 // a return to the hardware. Tail-calling another function would probably
3059 // break this.
3060 if (CallerF.hasFnAttribute("interrupt"))
3061 return false;
3062
 // If canGuaranteeTCO accepts the callee's convention, eligibility reduces to
 // the caller and callee using the same calling convention.
3063 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
3064 return CalleeCC == CallerCC;
3065
3066 // Also avoid sibcall optimization if either caller or callee uses struct
3067 // return semantics.
3068 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
3069 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
3070 if (isCalleeStructRet || isCallerStructRet)
3071 return false;
3072
3073 // Externally-defined functions with weak linkage should not be
3074 // tail-called on ARM when the OS does not support dynamic
3075 // pre-emption of symbols, as the AAELF spec requires normal calls
3076 // to undefined weak functions to be replaced with a NOP or jump to the
3077 // next instruction. The behaviour of branch instructions in this
3078 // situation (as used for tail calls) is implementation-defined, so we
3079 // cannot rely on the linker replacing the tail call with a return.
3081 const GlobalValue *GV = G->getGlobal();
3083 if (GV->hasExternalWeakLinkage() &&
3084 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3085 return false;
3086 }
3087
3088 // Check that the call results are passed in the same way.
3089 LLVMContext &C = *DAG.getContext();
3091 getEffectiveCallingConv(CalleeCC, isVarArg),
3092 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
3093 CCAssignFnForReturn(CalleeCC, isVarArg),
3094 CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
3095 return false;
3096 // The callee has to preserve all registers the caller needs to preserve.
3097 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3098 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3099 if (CalleeCC != CallerCC) {
3100 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3101 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3102 return false;
3103 }
3104
3105 // If Caller's vararg or byval argument has been split between registers and
3106 // stack, do not perform tail call, since part of the argument is in caller's
3107 // local frame.
3108 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
3109 if (AFI_Caller->getArgRegsSaveSize())
3110 return false;
3111
3112 // If the callee takes no arguments then go on to check the results of the
3113 // call.
3114 if (!Outs.empty()) {
3115 if (CCInfo.getStackSize()) {
3116 // Check if the arguments are already laid out in the right way as
3117 // the caller's fixed stack objects.
3118 MachineFrameInfo &MFI = MF.getFrameInfo();
3119 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3120 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
 // i walks ArgLocs while realArgIdx walks Outs/OutVals; the extra ++i
 // steps below skip the additional locations of a split f64/v2f64 argument.
3121 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
3122 i != e;
3123 ++i, ++realArgIdx) {
3124 CCValAssign &VA = ArgLocs[i];
3125 EVT RegVT = VA.getLocVT();
3126 SDValue Arg = OutVals[realArgIdx];
3127 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3129 return false;
3130 if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
3131 // f64 and vector types are split into multiple registers or
3132 // register/stack-slot combinations. The types will not match
3133 // the registers; give up on memory f64 refs until we figure
3134 // out what to do about this.
3135 if (!VA.isRegLoc())
3136 return false;
3137 if (!ArgLocs[++i].isRegLoc())
3138 return false;
3139 if (RegVT == MVT::v2f64) {
3140 if (!ArgLocs[++i].isRegLoc())
3141 return false;
3142 if (!ArgLocs[++i].isRegLoc())
3143 return false;
3144 }
3145 } else if (!VA.isRegLoc()) {
 // A stack-passed argument must already sit at the matching offset.
3146 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3147 MFI, MRI, TII))
3148 return false;
3149 }
3150 }
3151 }
3152
3153 const MachineRegisterInfo &MRI = MF.getRegInfo();
3154 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3155 return false;
3156 }
3157
3158 return true;
3159}
3160
/// Reports whether the return values described by Outs can be lowered: the
/// result is whatever CCState::CheckReturn yields for the return-value
/// assignment function selected by CCAssignFnForReturn(CallConv, isVarArg).
3161bool
3162ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
3163 MachineFunction &MF, bool isVarArg,
3165 LLVMContext &Context) const {
3167 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3168 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3169}
3170
3172 const SDLoc &DL, SelectionDAG &DAG) {
3173 const MachineFunction &MF = DAG.getMachineFunction();
3174 const Function &F = MF.getFunction();
3175
 // The value of the "interrupt" function attribute selects the LR offset.
3176 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
3177
3178 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
3179 // version of the "preferred return address". These offsets affect the return
3180 // instruction if this is a return from PL1 without hypervisor extensions.
3181 // IRQ/FIQ: +4 "subs pc, lr, #4"
3182 // SWI: 0 "subs pc, lr, #0"
3183 // ABORT: +4 "subs pc, lr, #4"
3184 // UNDEF: +4/+2 "subs pc, lr, #0"
3185 // UNDEF varies depending on where the exception came from ARM or Thumb
3186 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
3187
 // An empty attribute value is treated the same way as IRQ, FIQ and ABORT.
 // Any value outside the accepted set is a fatal error.
3188 int64_t LROffset;
3189 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
3190 IntKind == "ABORT")
3191 LROffset = 4;
3192 else if (IntKind == "SWI" || IntKind == "UNDEF")
3193 LROffset = 0;
3194 else
3195 report_fatal_error("Unsupported interrupt attribute. If present, value "
3196 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
3197
 // The offset constant is inserted at index 1 of RetOps, ahead of whatever
 // previously occupied that position.
3198 RetOps.insert(RetOps.begin() + 1,
3199 DAG.getConstant(LROffset, DL, MVT::i32, false));
3200
3201 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
3202}
3203
/// Lowers a function return: each return value is copied into its assigned
/// location register (glued together), the registers are appended to the
/// return node's operand list, and the final return node is built. Functions
/// carrying the "interrupt" attribute on non-M-class subtargets are routed
/// through LowerInterruptReturn instead.
3204SDValue
3205ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3206 bool isVarArg,
3208 const SmallVectorImpl<SDValue> &OutVals,
3209 const SDLoc &dl, SelectionDAG &DAG) const {
3210 // CCValAssign - represent the assignment of the return value to a location.
3212
3213 // CCState - Info about the registers and stack slots.
3214 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3215 *DAG.getContext());
3216
3217 // Analyze outgoing return values.
3218 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3219
3220 SDValue Glue;
3222 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3223 bool isLittleEndian = Subtarget->isLittle();
3224
3227 AFI->setReturnRegsCount(RVLocs.size());
3228
3229 // Report error if cmse entry function returns structure through first ptr arg.
3230 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
3231 // Note: using an empty SDLoc(), as the first line of the function is a
3232 // better place to report than the last line.
3235 "secure entry function would return value through pointer",
3236 SDLoc().getDebugLoc());
3237 DAG.getContext()->diagnose(Diag);
3238 }
3239
3240 // Copy the result values into the output registers.
 // i indexes RVLocs and is advanced additional times below for values that
 // occupy several locations; realRVLocIdx indexes OutVals/Outs and advances
 // once per iteration.
3241 for (unsigned i = 0, realRVLocIdx = 0;
3242 i != RVLocs.size();
3243 ++i, ++realRVLocIdx) {
3244 CCValAssign &VA = RVLocs[i];
3245 assert(VA.isRegLoc() && "Can only return in registers!");
3246
3247 SDValue Arg = OutVals[realRVLocIdx];
3248 bool ReturnF16 = false;
3249
3250 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
3251 // Half-precision return values can be returned like this:
3252 //
3253 // t11: f16 = fadd ...
3254 // t12: i16 = bitcast t11
3255 // t13: i32 = zero_extend t12
3256 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3257 //
3258 // to avoid code generation for bitcasts, we simply set Arg to the node
3259 // that produces the f16 value, t11 in this case.
3260 //
3261 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3262 SDValue ZE = Arg.getOperand(0);
3263 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3264 SDValue BC = ZE.getOperand(0);
3265 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3266 Arg = BC.getOperand(0);
3267 ReturnF16 = true;
3268 }
3269 }
3270 }
3271 }
3272
 // Only Full and BCvt location kinds are expected here; the bitcast is
 // skipped when the f16 producer was substituted for Arg above.
3273 switch (VA.getLocInfo()) {
3274 default: llvm_unreachable("Unknown loc info!");
3275 case CCValAssign::Full: break;
3276 case CCValAssign::BCvt:
3277 if (!ReturnF16)
3278 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3279 break;
3280 }
3281
3282 // Mask f16 arguments if this is a CMSE nonsecure entry.
3283 auto RetVT = Outs[realRVLocIdx].ArgVT;
3284 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3285 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3286 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3287 } else {
 // Keep only the low RetVT-sized bits of the location-sized value:
 // bitcast to an integer, AND with the mask, bitcast back.
3288 auto LocBits = VA.getLocVT().getSizeInBits();
3289 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3290 SDValue Mask =
3291 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3292 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3293 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3294 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3295 }
3296 }
3297
3298 if (VA.needsCustom() &&
3299 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3300 if (VA.getLocVT() == MVT::v2f64) {
3301 // Extract the first half and return it in two registers.
3302 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3303 DAG.getConstant(0, dl, MVT::i32));
3304 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3305 DAG.getVTList(MVT::i32, MVT::i32), Half);
3306
 // The endianness decides which VMOVRRD result goes into the first
 // location register.
3307 Chain =
3308 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3309 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3310 Glue = Chain.getValue(1);
3311 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3312 VA = RVLocs[++i]; // skip ahead to next loc
3313 Chain =
3314 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3315 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3316 Glue = Chain.getValue(1);
3317 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3318 VA = RVLocs[++i]; // skip ahead to next loc
3319
3320 // Extract the 2nd half and fall through to handle it as an f64 value.
3321 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3322 DAG.getConstant(1, dl, MVT::i32));
3323 }
3324 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3325 // available.
3326 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3327 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3328 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3329 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3330 Glue = Chain.getValue(1);
3331 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3332 VA = RVLocs[++i]; // skip ahead to next loc
3333 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3334 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3335 } else
3336 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3337
3338 // Guarantee that all emitted copies are
3339 // stuck together, avoiding something bad.
3340 Glue = Chain.getValue(1);
3341 RetOps.push_back(DAG.getRegister(
3342 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3343 }
 // Registers reported by getCalleeSavedRegsViaCopy are appended to the return
 // operands as well; the list is walked until its zero terminator.
3344 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3345 const MCPhysReg *I =
3346 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3347 if (I) {
3348 for (; *I; ++I) {
3349 if (ARM::GPRRegClass.contains(*I))
3350 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3351 else if (ARM::DPRRegClass.contains(*I))
3353 else
3354 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3355 }
3356 }
3357
3358 // Update chain and glue.
3359 RetOps[0] = Chain;
3360 if (Glue.getNode())
3361 RetOps.push_back(Glue);
3362
3363 // CPUs which aren't M-class use a special sequence to return from
3364 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3365 // though we use "subs pc, lr, #N").
3366 //
3367 // M-class CPUs actually use a normal return sequence with a special
3368 // (hardware-provided) value in LR, so the normal code path works.
3369 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3370 !Subtarget->isMClass()) {
3371 if (Subtarget->isThumb1Only())
3372 report_fatal_error("interrupt attribute is not supported in Thumb1");
3373 return LowerInterruptReturn(RetOps, dl, DAG);
3374 }
3375
3378 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3379}
3380
/// Returns true when the single result of N is consumed only by a return
/// node, reached through one of three shapes: a CopyToReg, a VMOVRRD whose
/// users are CopyToReg nodes, or a BITCAST feeding a CopyToReg. On success
/// Chain is overwritten with the chain operand of the first copy; on failure
/// Chain is left untouched.
3381bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
 // N must produce exactly one value, and that value must have exactly one use.
3382 if (N->getNumValues() != 1)
3383 return false;
3384 if (!N->hasNUsesOfValue(1, 0))
3385 return false;
3386
3387 SDValue TCChain = Chain;
3388 SDNode *Copy = *N->use_begin();
3389 if (Copy->getOpcode() == ISD::CopyToReg) {
3390 // If the copy has a glue operand, we conservatively assume it isn't safe to
3391 // perform a tail call.
3392 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3393 return false;
3394 TCChain = Copy->getOperand(0);
3395 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3396 SDNode *VMov = Copy;
3397 // f64 returned in a pair of GPRs.
3399 for (SDNode *U : VMov->uses()) {
3400 if (U->getOpcode() != ISD::CopyToReg)
3401 return false;
3402 Copies.insert(U);
3403 }
3404 if (Copies.size() > 2)
3405 return false;
3406
 // A copy whose chain operand is another collected copy is the later one;
 // it becomes the node whose users are checked for a return below.
3407 for (SDNode *U : VMov->uses()) {
3408 SDValue UseChain = U->getOperand(0);
3409 if (Copies.count(UseChain.getNode()))
3410 // Second CopyToReg
3411 Copy = U;
3412 else {
3413 // We are at the top of this chain.
3414 // If the copy has a glue operand, we conservatively assume it
3415 // isn't safe to perform a tail call.
3416 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3417 return false;
3418 // First CopyToReg
3419 TCChain = UseChain;
3420 }
3421 }
3422 } else if (Copy->getOpcode() == ISD::BITCAST) {
3423 // f32 returned in a single GPR.
3424 if (!Copy->hasOneUse())
3425 return false;
3426 Copy = *Copy->use_begin();
3427 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3428 return false;
3429 // If the copy has a glue operand, we conservatively assume it isn't safe to
3430 // perform a tail call.
3431 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3432 return false;
3433 TCChain = Copy->getOperand(0);
3434 } else {
3435 return false;
3436 }
3437
 // Every user of the final copy must be a return node, and there must be at
 // least one such user.
3438 bool HasRet = false;
3439 for (const SDNode *U : Copy->uses()) {
3440 if (U->getOpcode() != ARMISD::RET_GLUE &&
3441 U->getOpcode() != ARMISD::INTRET_GLUE)
3442 return false;
3443 HasRet = true;
3444 }
3445
3446 if (!HasRet)
3447 return false;
3448
3449 Chain = TCChain;
3450 return true;
3451}
3452
3453bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3454 if (!Subtarget->supportsTailCall())
3455 return false;
3456
3457 if (!CI->isTailCall())
3458 return false;
3459
3460 return true;
3461}
3462
3463// Trying to write a 64 bit value so need to split into two 32 bit values first,
3464// and pass the lower and high parts through.
3466 SDLoc DL(Op);
 // Operand 2 of the incoming node is the value being written.
3467 SDValue WriteValue = Op->getOperand(2);
3468
3469 // This function is only supposed to be called for i64 type argument.
3470 assert(WriteValue.getValueType() == MVT::i64
3471 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3472
3473 SDValue Lo, Hi;
3474 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
 // Rebuild the node: operands 0 and 1 are forwarded unchanged, and the single
 // i64 value is replaced by its two i32 halves (Lo first, then Hi).
3475 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3476 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3477}
3478
3479// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3480// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3481// one of the above mentioned nodes. It has to be wrapped because otherwise
3482// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3483// be used to form addressing mode. These wrapped nodes will be selected
3484// into MOVi.
3485SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3486 SelectionDAG &DAG) const {
3487 EVT PtrVT = Op.getValueType();
3488 // FIXME there is no actual debug info here
3489 SDLoc dl(Op);
3491 SDValue Res;
3492
3493 // When generating execute-only code Constant Pools must be promoted to the
3494 // global data section. It's a bit ugly that we can't share them across basic
3495 // blocks, but this way we guarantee that execute-only behaves correctly with
3496 // position-independent addressing modes.
3497 if (Subtarget->genExecuteOnly()) {
3498 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3499 auto T = const_cast<Type*>(CP->getType());
3500 auto C = const_cast<Constant*>(CP->getConstVal());
3501 auto M = const_cast<Module*>(DAG.getMachineFunction().
 // The constant becomes the initializer of a new internal, constant global
 // variable, which is then lowered like any other global address.
3503 auto GV = new GlobalVariable(
3504 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3507 Twine(AFI->createPICLabelUId())
3508 );
3510 dl, PtrVT);
3511 return LowerGlobalAddress(GA, DAG);
3512 }
3513
3514 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3515 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3516 Align CPAlign = CP->getAlign();
3517 if (Subtarget->isThumb1Only())
3518 CPAlign = std::max(CPAlign, Align(4));
 // Machine constant-pool entries and plain IR constants use different
 // accessors; both end up as a target constant-pool node.
3519 if (CP->isMachineConstantPoolEntry())
3520 Res =
3521 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3522 else
3523 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3524 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3525}
3526
3528 // If we don't have a 32-bit pc-relative branch instruction then the jump
3529 // table consists of block addresses. Usually this is inline, but for
3530 // execute-only it must be placed out-of-line.
 // The condition below selects execute-only code generation on subtargets
 // that lack the v8-M Baseline operations.
3531 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3534}
3535
/// Lowers a block address by loading it from a constant-pool entry. For
/// position-independent or ROPI code the entry is PC-relative and the loaded
/// value is combined with a PIC label through ARMISD::PIC_ADD; otherwise the
/// loaded value is returned directly.
3536SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3537 SelectionDAG &DAG) const {
3540 unsigned ARMPCLabelIndex = 0;
3541 SDLoc DL(Op);
3542 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3543 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3544 SDValue CPAddr;
3545 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3546 if (!IsPositionIndependent) {
3547 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3548 } else {
 // PC adjustment passed to the constant-pool entry: 4 in Thumb mode, 8
 // otherwise.
3549 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3550 ARMPCLabelIndex = AFI->createPICLabelUId();
3552 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3553 ARMCP::CPBlockAddress, PCAdj);
3554 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3555 }
3556 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3557 SDValue Result = DAG.getLoad(
3558 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3560 if (!IsPositionIndependent)
3561 return Result;
 // Position-independent case: add the PIC label to the loaded value.
3562 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3563 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3564}
3565
3566/// Convert a TLS address reference into the correct sequence of loads
3567/// and calls to compute the variable's address for Darwin, and return an
3568/// SDValue containing the final node.
3569
3570/// Darwin only has one TLS scheme which must be capable of dealing with the
3571/// fully general situation, in the worst case. This means:
3572/// + "extern __thread" declaration.
3573/// + Defined in a possibly unknown dynamic library.
3574///
3575/// The general system is that each __thread variable has a [3 x i32] descriptor
3576/// which contains information used by the runtime to calculate the address. The
3577/// only part of this the compiler needs to know about is the first word, which
3578/// contains a function pointer that must be called with the address of the
3579/// entire descriptor in "r0".
3580///
3581/// Since this descriptor may be in a different unit, in general access must
3582/// proceed along the usual ARM rules. A common sequence to produce is:
3583///
3584/// movw rT1, :lower16:_var$non_lazy_ptr
3585/// movt rT1, :upper16:_var$non_lazy_ptr
3586/// ldr r0, [rT1]
3587/// ldr rT2, [r0]
3588/// blx rT2
3589/// [...address now in r0...]
3590SDValue
3591ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3592 SelectionDAG &DAG) const {
3593 assert(Subtarget->isTargetDarwin() &&
3594 "This function expects a Darwin target");
3595 SDLoc DL(Op);
3596
3597 // First step is to get the address of the actual global symbol. This is where
3598 // the TLS descriptor lives.
3599 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3600
3601 // The first entry in the descriptor is a function pointer that we must call
3602 // to obtain the address of the variable.
3603 SDValue Chain = DAG.getEntryNode();
3604 SDValue FuncTLVGet = DAG.getLoad(
3605 MVT::i32, DL, Chain, DescAddr,
3609 Chain = FuncTLVGet.getValue(1);
3610
 // Record in the frame info that this function adjusts the stack.
3612 MachineFrameInfo &MFI = F.getFrameInfo();
3613 MFI.setAdjustsStack(true);
3614
3615 // TLS calls preserve all registers except those that absolutely must be
3616 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3617 // silly).
3618 auto TRI =
3620 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3622
3623 // Finally, we can make the call. This is just a degenerate version of a
3624 // normal ARM call node: r0 takes the address of the descriptor, and
3625 // returns the address of the variable in this thread.
3626 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3627 Chain =
3628 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3629 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3630 DAG.getRegisterMask(Mask), Chain.getValue(1));
3631 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3632}
3633
/// Computes the address of a thread-local variable on Windows: read the TEB,
/// load the TLS array pointer from it, index that array with _tls_index to
/// find this module's TLS block, and add the variable's section-relative
/// offset.
3634SDValue
3635ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3636 SelectionDAG &DAG) const {
3637 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3638
3639 SDValue Chain = DAG.getEntryNode();
3640 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3641 SDLoc DL(Op);
3642
3643 // Load the current TEB (thread environment block)
 // This is done with the arm_mrc intrinsic and the operands 15, 0, 13, 0, 2.
3644 SDValue Ops[] = {Chain,
3645 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3646 DAG.getTargetConstant(15, DL, MVT::i32),
3647 DAG.getTargetConstant(0, DL, MVT::i32),
3648 DAG.getTargetConstant(13, DL, MVT::i32),
3649 DAG.getTargetConstant(0, DL, MVT::i32),
3650 DAG.getTargetConstant(2, DL, MVT::i32)};
3651 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3652 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3653
3654 SDValue TEB = CurrentTEB.getValue(0);
3655 Chain = CurrentTEB.getValue(1);
3656
3657 // Load the ThreadLocalStoragePointer from the TEB
3658 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3659 SDValue TLSArray =
3660 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3661 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3662
3663 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
3664 // offset into the TLSArray.
3665
3666 // Load the TLS index from the C runtime
3667 SDValue TLSIndex =
3668 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3669 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3670 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3671
 // Shifting left by 2 performs the scaling by 4 mentioned above.
3672 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3673 DAG.getConstant(2, DL, MVT::i32));
3674 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3675 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3677
3678 // Get the offset of the start of the .tls section (section base)
3679 const auto *GA = cast<GlobalAddressSDNode>(Op);
3680 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3681 SDValue Offset = DAG.getLoad(
3682 PtrVT, DL, Chain,
3683 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3684 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3686
3687 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3688}
3689
3690// Lower ISD::GlobalTLSAddress using the "general dynamic" model
// The argument for __tls_get_addr is loaded from a PC-relative constant-pool
// entry (TLSGD modifier) and fixed up with PIC_ADD; the result of the call is
// returned as the variable's address.
3691SDValue
3692ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3693 SelectionDAG &DAG) const {
3694 SDLoc dl(GA);
3695 EVT PtrVT = getPointerTy(DAG.getDataLayout());
 // PC adjustment for the constant-pool entry: 4 in Thumb mode, 8 otherwise.
3696 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3699 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3701 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3702 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3703 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3704 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3705 Argument = DAG.getLoad(
3706 PtrVT, dl, DAG.getEntryNode(), Argument,
3708 SDValue Chain = Argument.getValue(1);
3709
3710 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3711 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3712
3713 // call __tls_get_addr.
 // The single argument is passed with a 32-bit integer IR type.
3714 ArgListTy Args;
3715 ArgListEntry Entry;
3716 Entry.Node = Argument;
3717 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
3718 Args.push_back(Entry);
3719
3720 // FIXME: is there useful debug info available here?
3722 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3724 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3725
 // Only the call's result value is returned; the output chain is dropped.
3726 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3727 return CallResult.first;
3728}
3729
3730// Lower ISD::GlobalTLSAddress using the "initial exec" or
3731// "local exec" model.
// Both models compute an offset and add it to the thread pointer. Initial
// exec performs two loads (constant-pool entry, then the PIC_ADD-adjusted
// address); local exec performs a single constant-pool load.
3732SDValue
3733ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3734 SelectionDAG &DAG,
3735 TLSModel::Model model) const {
3736 const GlobalValue *GV = GA->getGlobal();
3737 SDLoc dl(GA);
3739 SDValue Chain = DAG.getEntryNode();
3740 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3741 // Get the Thread Pointer
3743
3744 if (model == TLSModel::InitialExec) {
3747 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3748 // Initial exec model.
3749 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3751 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3753 true);
3754 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3755 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3756 Offset = DAG.getLoad(
3757 PtrVT, dl, Chain, Offset,
3759 Chain = Offset.getValue(1);
3760
3761 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3762 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3763
 // Second load: fetch the offset through the address formed by PIC_ADD.
3764 Offset = DAG.getLoad(
3765 PtrVT, dl, Chain, Offset,
3767 } else {
3768 // local exec model
3769 assert(model == TLSModel::LocalExec);
3772 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3773 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3774 Offset = DAG.getLoad(
3775 PtrVT, dl, Chain, Offset,
3777 }
3778
3779 // The address of the thread local variable is the add of the thread
3780 // pointer with the offset of the variable.
3781 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3782}
3783
/// Entry point for lowering a thread-local global address. Dispatch order:
/// emulated TLS first, then Darwin, then Windows; what remains must be ELF,
/// where the TLS model picks between the general-dynamic and exec lowerings.
3784SDValue
3785ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3787 if (DAG.getTarget().useEmulatedTLS())
3788 return LowerToTLSEmulatedModel(GA, DAG);
3789
3790 if (Subtarget->isTargetDarwin())
3791 return LowerGlobalTLSAddressDarwin(Op, DAG);
3792
3793 if (Subtarget->isTargetWindows())
3794 return LowerGlobalTLSAddressWindows(Op, DAG);
3795
3796 // TODO: implement the "local dynamic" model
3797 assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3799
3800 switch (model) {
3803 return LowerToTLSGeneralDynamicModel(GA, DAG);
3806 return LowerToTLSExecModels(GA, DAG, model);
3807 }
3808 llvm_unreachable("bogus TLS model");
3809}
3810
3811/// Return true if all users of V are within function F, looking through
3812/// ConstantExprs.
3813static bool allUsersAreInFunction(const Value *V, const Function *F) {
3814 SmallVector<const User*,4> Worklist(V->users());
3815 while (!Worklist.empty()) {
3816 auto *U = Worklist.pop_back_val();
3817 if (isa<ConstantExpr>(U)) {
3818 append_range(Worklist, U->users());
3819 continue;
3820 }
3821
3822 auto *I = dyn_cast<Instruction>(U);
3823 if (!I || I->getParent()->getParent() != F)
3824 return false;
3825 }
3826 return true;
3827}
3828
3830 const GlobalValue *GV, SelectionDAG &DAG,
3831 EVT PtrVT, const SDLoc &dl) {
3832 // If we're creating a pool entry for a constant global with unnamed address,
3833 // and the global is small enough, we can emit it inline into the constant pool
3834 // to save ourselves an indirection.
3835 //
3836 // This is a win if the constant is only used in one function (so it doesn't
3837 // need to be duplicated) or duplicating the constant wouldn't increase code
3838 // size (implying the constant is no larger than 4 bytes).
 //
 // An empty SDValue is returned whenever promotion is rejected.
3839 const Function &F = DAG.getMachineFunction().getFunction();
3840
3841 // We rely on this decision to inline being idempotent and unrelated to the
3842 // use-site. We know that if we inline a variable at one use site, we'll
3843 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3844 // doesn't know about this optimization, so bail out if it's enabled else
3845 // we could decide to inline here (and thus never emit the GV) but require
3846 // the GV from fast-isel generated code.
3849 return SDValue();
3850
 // Only constant, initialized, local-linkage global variables with a global
 // unnamed_addr are candidates.
3851 auto *GVar = dyn_cast<GlobalVariable>(GV);
3852 if (!GVar || !GVar->hasInitializer() ||
3853 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3854 !GVar->hasLocalLinkage())
3855 return SDValue();
3856
3857 // If we inline a value that contains relocations, we move the relocations
3858 // from .data to .text. This is not allowed in position-independent code.
3859 auto *Init = GVar->getInitializer();
3860 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3861 Init->needsDynamicRelocation())
3862 return SDValue();
3863
3864 // The constant islands pass can only really deal with alignment requests
3865 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3866 // any type wanting greater alignment requirements than 4 bytes. We also
3867 // can only promote constants that are multiples of 4 bytes in size or
3868 // are paddable to a multiple of 4. Currently we only try and pad constants
3869 // that are strings for simplicity.
3870 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3871 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3872 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
 // RequiredPadding == 4 means Size is already a multiple of 4, i.e. no padding
 // is needed; any other value is the number of bytes to append.
3873 unsigned RequiredPadding = 4 - (Size % 4);
3874 bool PaddingPossible =
3875 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3876 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3877 Size == 0)
3878 return SDValue();
3879
3880 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3882 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3883
3884 // We can't bloat the constant pool too much, else the ConstantIslands pass
3885 // may fail to converge. If we haven't promoted this global yet (it may have
3886 // multiple uses), and promoting it would increase the constant pool size
3887 // (Size > 4), ensure we have space to do so up to MaxTotal.
3888 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3889 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3891 return SDValue();
3892
3893 // This is only valid if all users are in a single function; we can't clone
3894 // the constant in general. The LLVM IR unnamed_addr allows merging
3895 // constants, but not cloning them.
3896 //
3897 // We could potentially allow cloning if we could prove all uses of the
3898 // constant in the current function don't care about the address, like
3899 // printf format strings. But that isn't implemented for now.
3900 if (!allUsersAreInFunction(GVar, &F))
3901 return SDValue();
3902
3903 // We're going to inline this global. Pad it out if needed.
3904 if (RequiredPadding != 4) {
 // CDAInit is non-null here: padding is only accepted above for string
 // ConstantDataArray initializers.
3905 StringRef S = CDAInit->getAsString();
3906
3908 std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3909 while (RequiredPadding--)
3910 V.push_back(0);
3912 }
3913
3914 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3915 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3916 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3919 PaddedSize - 4);
3920 }
3921 ++NumConstpoolPromoted;
3922 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3923}
3924
3926 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3927 if (!(GV = GA->getAliaseeObject()))
3928 return false;
3929 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3930 return V->isConstant();
3931 return isa<Function>(GV);
3932}
3933
3934SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3935 SelectionDAG &DAG) const {
3936 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3937 default: llvm_unreachable("unknown object format");
3938 case Triple::COFF:
3939 return LowerGlobalAddressWindows(Op, DAG);
3940 case Triple::ELF:
3941 return LowerGlobalAddressELF(Op, DAG);
3942 case Triple::MachO:
3943 return LowerGlobalAddressDarwin(Op, DAG);
3944 }
3945}
3946
3947SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3948 SelectionDAG &DAG) const {
3949 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3950 SDLoc dl(Op);
3951 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3952 bool IsRO = isReadOnly(GV);
3953
3954 // promoteToConstantPool only if not generating XO text section
3955 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3956 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3957 return V;
3958
3959 if (isPositionIndependent()) {
3961 GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
3962 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3963 if (!GV->isDSOLocal())
3964 Result =
3965 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3967 return Result;
3968 } else if (Subtarget->isROPI() && IsRO) {
3969 // PC-relative.
3970 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3971 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3972 return Result;
3973 } else if (Subtarget->isRWPI() && !IsRO) {
3974 // SB-relative.
3975 SDValue RelAddr;
3976 if (Subtarget->useMovt()) {
3977 ++NumMovwMovt;
3978 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3979 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3980 } else { // use literal pool for address constant
3983 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3984 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3985 RelAddr = DAG.getLoad(
3986 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3988 }
3989 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3990 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3991 return Result;
3992 }
3993
3994 // If we have T2 ops, we can materialize the address directly via movt/movw
3995 // pair. This is always cheaper. If need to generate Execute Only code, and we
3996 // only have Thumb1 available, we can't use a constant pool and are forced to
3997 // use immediate relocations.
3998 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
3999 if (Subtarget->useMovt())
4000 ++NumMovwMovt;
4001 // FIXME: Once remat is capable of dealing with instructions with register
4002 // operands, expand this into two nodes.
4003 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
4004 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
4005 } else {
4006 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
4007 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
4008 return DAG.getLoad(
4009 PtrVT, dl, DAG.getEntryNode(), CPAddr,
4011 }
4012}
4013
4014SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
4015 SelectionDAG &DAG) const {
4016 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
4017 "ROPI/RWPI not currently supported for Darwin");
4018 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4019 SDLoc dl(Op);
4020 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
4021
4022 if (Subtarget->useMovt())
4023 ++NumMovwMovt;
4024
4025 // FIXME: Once remat is capable of dealing with instructions with register
4026 // operands, expand this into multiple nodes
4027 unsigned Wrapper =
4029
4030 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
4031 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
4032
4033 if (Subtarget->isGVIndirectSymbol(GV))
4034 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
4036 return Result;
4037}
4038
4039SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
4040 SelectionDAG &DAG) const {
4041 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
4042 assert(Subtarget->useMovt() &&
4043 "Windows on ARM expects to use movw/movt");
4044 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
4045 "ROPI/RWPI not currently supported for Windows");
4046
4048 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
4049 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
4050 if (GV->hasDLLImportStorageClass())
4051 TargetFlags = ARMII::MO_DLLIMPORT;
4052 else if (!TM.shouldAssumeDSOLocal(GV))
4053 TargetFlags = ARMII::MO_COFFSTUB;
4054 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4056 SDLoc DL(Op);
4057
4058 ++NumMovwMovt;
4059
4060 // FIXME: Once remat is capable of dealing with instructions with register
4061 // operands, expand this into two nodes.
4062 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
4063 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
4064 TargetFlags));
4065 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
4066 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
4068 return Result;
4069}
4070
4071SDValue
4072ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
4073 SDLoc dl(Op);
4074 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
4075 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
4076 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
4077 Op.getOperand(1), Val);
4078}
4079
4080SDValue
4081ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
4082 SDLoc dl(Op);
4083 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
4084 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
4085}
4086
4087SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
4088 SelectionDAG &DAG) const {
4089 SDLoc dl(Op);
4090 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
4091 Op.getOperand(0));
4092}
4093
4094SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
4095 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
4096 unsigned IntNo =
4097 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
4098 switch (IntNo) {
4099 default:
4100 return SDValue(); // Don't custom lower most intrinsics.
4101 case Intrinsic::arm_gnu_eabi_mcount: {
4103 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4104 SDLoc dl(Op);
4105 SDValue Chain = Op.getOperand(0);
4106 // call "\01__gnu_mcount_nc"
4107 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
4108 const uint32_t *Mask =
4110 assert(Mask && "Missing call preserved mask for calling convention");
4111 // Mark LR an implicit live-in.
4112 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
4113 SDValue ReturnAddress =
4114 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
4115 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
4116 SDValue Callee =
4117 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
4119 if (Subtarget->isThumb())
4120 return SDValue(
4121 DAG.getMachineNode(
4122 ARM::tBL_PUSHLR, dl, ResultTys,
4123 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
4124 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
4125 0);
4126 return SDValue(
4127 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
4128 {ReturnAddress, Callee, RegisterMask, Chain}),
4129 0);
4130 }
4131 }
4132}
4133
4134SDValue
4135ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
4136 const ARMSubtarget *Subtarget) const {
4137 unsigned IntNo = Op.getConstantOperandVal(0);
4138 SDLoc dl(Op);
4139 switch (IntNo) {
4140 default: return SDValue(); // Don't custom lower most intrinsics.
4141 case Intrinsic::thread_pointer: {
4142 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4143 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
4144 }
4145 case Intrinsic::arm_cls: {
4146 const SDValue &Operand = Op.getOperand(1);
4147 const EVT VTy = Op.getValueType();
4148 SDValue SRA =
4149 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
4150 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
4151 SDValue SHL =
4152 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
4153 SDValue OR =
4154 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
4155 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
4156 return Result;
4157 }
4158 case Intrinsic::arm_cls64: {
4159 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
4160 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
4161 const SDValue &Operand = Op.getOperand(1);
4162 const EVT VTy = Op.getValueType();
4163 SDValue Lo, Hi;
4164 std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);
4165 SDValue Constant0 = DAG.getConstant(0, dl, VTy);
4166 SDValue Constant1 = DAG.getConstant(1, dl, VTy);
4167 SDValue Constant31 = DAG.getConstant(31, dl, VTy);
4168 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
4169 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
4170 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
4171 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
4172 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
4173 SDValue CheckLo =
4174 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
4175 SDValue HiIsZero =
4176 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
4177 SDValue AdjustedLo =
4178 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
4179 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
4180 SDValue Result =
4181 DAG.getSelect(dl, VTy, CheckLo,
4182 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
4183 return Result;
4184 }
4185 case Intrinsic::eh_sjlj_lsda: {
4188 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
4189 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4190 SDValue CPAddr;
4191 bool IsPositionIndependent = isPositionIndependent();
4192 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
4194 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
4195 ARMCP::CPLSDA, PCAdj);
4196 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
4197 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
4198 SDValue Result = DAG.getLoad(
4199 PtrVT, dl, DAG.getEntryNode(), CPAddr,
4201
4202 if (IsPositionIndependent) {
4203 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
4204 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
4205 }
4206 return Result;
4207 }
4208 case Intrinsic::arm_neon_vabs:
4209 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
4210 Op.getOperand(1));
4211 case Intrinsic::arm_neon_vabds:
4212 if (Op.getValueType().isInteger())
4213 return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
4214 Op.getOperand(1), Op.getOperand(2));
4215 return SDValue();
4216 case Intrinsic::arm_neon_vabdu:
4217 return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
4218 Op.getOperand(1), Op.getOperand(2));
4219 case Intrinsic::arm_neon_vmulls:
4220 case Intrinsic::arm_neon_vmullu: {
4221 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
4222 ? ARMISD::VMULLs : ARMISD::VMULLu;
4223 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4224 Op.getOperand(1), Op.getOperand(2));
4225 }
4226 case Intrinsic::arm_neon_vminnm:
4227 case Intrinsic::arm_neon_vmaxnm: {
4228 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
4229 ? ISD::FMINNUM : ISD::FMAXNUM;
4230 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4231 Op.getOperand(1), Op.getOperand(2));
4232 }
4233 case Intrinsic::arm_neon_vminu:
4234 case Intrinsic::arm_neon_vmaxu: {
4235 if (Op.getValueType().isFloatingPoint())
4236 return SDValue();
4237 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
4238 ? ISD::UMIN : ISD::UMAX;
4239 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4240 Op.getOperand(1), Op.getOperand(2));
4241 }
4242 case Intrinsic::arm_neon_vmins:
4243 case Intrinsic::arm_neon_vmaxs: {
4244 // v{min,max}s is overloaded between signed integers and floats.
4245 if (!Op.getValueType().isFloatingPoint()) {
4246 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4247 ? ISD::SMIN : ISD::SMAX;
4248 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4249 Op.getOperand(1), Op.getOperand(2));
4250 }
4251 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4252 ? ISD::FMINIMUM : ISD::FMAXIMUM;
4253 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4254 Op.getOperand(1), Op.getOperand(2));
4255 }
4256 case Intrinsic::arm_neon_vtbl1:
4257 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
4258 Op.getOperand(1), Op.getOperand(2));
4259 case Intrinsic::arm_neon_vtbl2:
4260 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
4261 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4262 case Intrinsic::arm_mve_pred_i2v:
4263 case Intrinsic::arm_mve_pred_v2i:
4264 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
4265 Op.getOperand(1));
4266 case Intrinsic::arm_mve_vreinterpretq:
4267 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
4268 Op.getOperand(1));
4269 case Intrinsic::arm_mve_lsll:
4270 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
4271 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4272 case Intrinsic::arm_mve_asrl:
4273 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
4274 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4275 }
4276}
4277
4279 const ARMSubtarget *Subtarget) {
4280 SDLoc dl(Op);
4281 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
4282 if (SSID == SyncScope::SingleThread)
4283 return Op;
4284
4285 if (!Subtarget->hasDataBarrier()) {
4286 // Some ARMv6 cpus can support data barriers with an mcr instruction.
4287 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
4288 // here.
4289 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
4290 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
4291 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4292 DAG.getConstant(0, dl, MVT::i32));
4293 }
4294
4295 AtomicOrdering Ord =
4296 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
4298 if (Subtarget->isMClass()) {
4299 // Only a full system barrier exists in the M-class architectures.
4301 } else if (Subtarget->preferISHSTBarriers() &&
4302 Ord == AtomicOrdering::Release) {
4303 // Swift happens to implement ISHST barriers in a way that's compatible with
4304 // Release semantics but weaker than ISH so we'd be fools not to use
4305 // it. Beware: other processors probably don't!
4307 }
4308
4309 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4310 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4311 DAG.getConstant(Domain, dl, MVT::i32));
4312}
4313
4315 const ARMSubtarget *Subtarget) {
4316 // ARM pre v5TE and Thumb1 does not have preload instructions.
4317 if (!(Subtarget->isThumb2() ||
4318 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4319 // Just preserve the chain.
4320 return Op.getOperand(0);
4321
4322 SDLoc dl(Op);
4323 unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
4324 if (!isRead &&
4325 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4326 // ARMv7 with MP extension has PLDW.
4327 return Op.getOperand(0);
4328
4329 unsigned isData = Op.getConstantOperandVal(4);
4330 if (Subtarget->isThumb()) {
4331 // Invert the bits.
4332 isRead = ~isRead & 1;
4333 isData = ~isData & 1;
4334 }
4335
4336 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4337 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4338 DAG.getConstant(isData, dl, MVT::i32));
4339}
4340
4343 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4344
4345 // vastart just stores the address of the VarArgsFrameIndex slot into the
4346 // memory location argument.
4347 SDLoc dl(Op);
4349 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4350 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4351 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4352 MachinePointerInfo(SV));
4353}
4354
4355SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4356 CCValAssign &NextVA,
4357 SDValue &Root,
4358 SelectionDAG &DAG,
4359 const SDLoc &dl) const {
4362
4363 const TargetRegisterClass *RC;
4364 if (AFI->isThumb1OnlyFunction())
4365 RC = &ARM::tGPRRegClass;
4366 else
4367 RC = &ARM::GPRRegClass;
4368
4369 // Transform the arguments stored in physical registers into virtual ones.
4370 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4371 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4372
4373 SDValue ArgValue2;
4374 if (NextVA.isMemLoc()) {
4375 MachineFrameInfo &MFI = MF.getFrameInfo();
4376 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4377
4378 // Create load node to retrieve arguments from the stack.
4379 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4380 ArgValue2 = DAG.getLoad(
4381 MVT::i32, dl, Root, FIN,
4383 } else {
4384 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4385 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4386 }
4387 if (!Subtarget->isLittle())
4388 std::swap (ArgValue, ArgValue2);
4389 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4390}
4391
4392// The remaining GPRs hold either the beginning of variable-argument
4393// data, or the beginning of an aggregate passed by value (usually
4394// byval). Either way, we allocate stack slots adjacent to the data
4395// provided by our caller, and store the unallocated registers there.
4396// If this is a variadic function, the va_list pointer will begin with
4397// these values; otherwise, this reassembles a (byval) structure that
4398// was split between registers and memory.
4399// Return: The frame index registers were stored into.
4400int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4401 const SDLoc &dl, SDValue &Chain,
4402 const Value *OrigArg,
4403 unsigned InRegsParamRecordIdx,
4404 int ArgOffset, unsigned ArgSize) const {
4405 // Currently, two use-cases possible:
4406 // Case #1. Non-var-args function, and we meet first byval parameter.
4407 // Setup first unallocated register as first byval register;
4408 // eat all remained registers
4409 // (these two actions are performed by HandleByVal method).
4410 // Then, here, we initialize stack frame with
4411 // "store-reg" instructions.
4412 // Case #2. Var-args function, that doesn't contain byval parameters.
4413 // The same: eat all remained unallocated registers,
4414 // initialize stack frame.
4415
4417 MachineFrameInfo &MFI = MF.getFrameInfo();
4419 unsigned RBegin, REnd;
4420 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4421 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4422 } else {
4423 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4424 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4425 REnd = ARM::R4;
4426 }
4427
4428 if (REnd != RBegin)
4429 ArgOffset = -4 * (ARM::R4 - RBegin);
4430
4431 auto PtrVT = getPointerTy(DAG.getDataLayout());
4432 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4433 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4434
4436 const TargetRegisterClass *RC =
4437 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4438
4439 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4440 Register VReg = MF.addLiveIn(Reg, RC);
4441 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4442 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4443 MachinePointerInfo(OrigArg, 4 * i));
4444 MemOps.push_back(Store);
4445 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4446 }
4447
4448 if (!MemOps.empty())
4449 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4450 return FrameIndex;
4451}
4452
4453// Setup stack frame, the va_list pointer will start from.
4454void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4455 const SDLoc &dl, SDValue &Chain,
4456 unsigned ArgOffset,
4457 unsigned TotalArgRegsSaveSize,
4458 bool ForceMutable) const {
4461
4462 // Try to store any remaining integer argument regs
4463 // to their spots on the stack so that they may be loaded by dereferencing
4464 // the result of va_next.
4465 // If there is no regs to be stored, just point address after last
4466 // argument passed via stack.
4467 int FrameIndex = StoreByValRegs(
4468 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4469 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4470 AFI->setVarArgsFrameIndex(FrameIndex);
4471}
4472
4473bool ARMTargetLowering::splitValueIntoRegisterParts(
4474 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4475 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4476 EVT ValueVT = Val.getValueType();
4477 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4478 unsigned ValueBits = ValueVT.getSizeInBits();
4479 unsigned PartBits = PartVT.getSizeInBits();
4480 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4481 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4482 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4483 Parts[0] = Val;
4484 return true;
4485 }
4486 return false;
4487}
4488
4489SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4490 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4491 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4492 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4493 unsigned ValueBits = ValueVT.getSizeInBits();
4494 unsigned PartBits = PartVT.getSizeInBits();
4495 SDValue Val = Parts[0];
4496
4497 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4498 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4499 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4500 return Val;
4501 }
4502 return SDValue();
4503}
4504
4505SDValue ARMTargetLowering::LowerFormalArguments(
4506 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4507 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4508 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4510 MachineFrameInfo &MFI = MF.getFrameInfo();
4511
4513
4514 // Assign locations to all of the incoming arguments.
4516 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4517 *DAG.getContext());
4518 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4519
4521 unsigned CurArgIdx = 0;
4522
4523 // Initially ArgRegsSaveSize is zero.
4524 // Then we increase this value each time we meet byval parameter.
4525 // We also increase this value in case of varargs function.
4526 AFI->setArgRegsSaveSize(0);
4527
4528 // Calculate the amount of stack space that we need to allocate to store
4529 // byval and variadic arguments that are passed in registers.
4530 // We need to know this before we allocate the first byval or variadic
4531 // argument, as they will be allocated a stack slot below the CFA (Canonical
4532 // Frame Address, the stack pointer at entry to the function).
4533 unsigned ArgRegBegin = ARM::R4;
4534 for (const CCValAssign &VA : ArgLocs) {
4535 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4536 break;
4537
4538 unsigned Index = VA.getValNo();
4539 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4540 if (!Flags.isByVal())
4541 continue;
4542
4543 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4544 unsigned RBegin, REnd;
4545 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4546 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4547
4548 CCInfo.nextInRegsParam();
4549 }
4550 CCInfo.rewindByValRegsInfo();
4551
4552 int lastInsIndex = -1;
4553 if (isVarArg && MFI.hasVAStart()) {
4554 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4555 if (RegIdx != std::size(GPRArgRegs))
4556 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4557 }
4558
4559 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4560 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4561 auto PtrVT = getPointerTy(DAG.getDataLayout());
4562
4563 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4564 CCValAssign &VA = ArgLocs[i];
4565 if (Ins[VA.getValNo()].isOrigArg()) {
4566 std::advance(CurOrigArg,
4567 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4568 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4569 }
4570 // Arguments stored in registers.
4571 if (VA.isRegLoc()) {
4572 EVT RegVT = VA.getLocVT();
4573 SDValue ArgValue;
4574
4575 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4576 // f64 and vector types are split up into multiple registers or
4577 // combinations of registers and stack slots.
4578 SDValue ArgValue1 =
4579 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4580 VA = ArgLocs[++i]; // skip ahead to next loc
4581 SDValue ArgValue2;
4582 if (VA.isMemLoc()) {
4583 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4584 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4585 ArgValue2 = DAG.getLoad(
4586 MVT::f64, dl, Chain, FIN,
4588 } else {
4589 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4590 }
4591 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4592 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4593 ArgValue1, DAG.getIntPtrConstant(0, dl));
4594 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4595 ArgValue2, DAG.getIntPtrConstant(1, dl));
4596 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4597 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4598 } else {
4599 const TargetRegisterClass *RC;
4600
4601 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4602 RC = &ARM::HPRRegClass;
4603 else if (RegVT == MVT::f32)
4604 RC = &ARM::SPRRegClass;
4605 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4606 RegVT == MVT::v4bf16)
4607 RC = &ARM::DPRRegClass;
4608 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4609 RegVT == MVT::v8bf16)
4610 RC = &ARM::QPRRegClass;
4611 else if (RegVT == MVT::i32)
4612 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4613 : &ARM::GPRRegClass;
4614 else
4615 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4616
4617 // Transform the arguments in physical registers into virtual ones.
4618 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4619 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4620
4621 // If this value is passed in r0 and has the returned attribute (e.g.
4622 // C++ 'structors), record this fact for later use.
4623 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4624 AFI->setPreservesR0();
4625 }
4626 }
4627
4628 // If this is an 8 or 16-bit value, it is really passed promoted
4629 // to 32 bits. Insert an assert[sz]ext to capture this, then
4630 // truncate to the right size.
4631 switch (VA.getLocInfo()) {
4632 default: llvm_unreachable("Unknown loc info!");
4633 case CCValAssign::Full: break;
4634 case CCValAssign::BCvt:
4635 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4636 break;
4637 }
4638
4639 // f16 arguments have their size extended to 4 bytes and passed as if they
4640 // had been copied to the LSBs of a 32-bit register.
4641 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
4642 if (VA.needsCustom() &&
4643 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4644 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4645
4646 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4647 // less than 32 bits must be sign- or zero-extended in the callee for
4648 // security reasons. Although the ABI mandates an extension done by the
4649 // caller, the latter cannot be trusted to follow the rules of the ABI.
4650 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4651 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4652 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4653 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4654
4655 InVals.push_back(ArgValue);
4656 } else { // VA.isRegLoc()
4657 // Only arguments passed on the stack should make it here.
4658 assert(VA.isMemLoc());
4659 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4660
4661 int index = VA.getValNo();
4662
4663 // Some Ins[] entries become multiple ArgLoc[] entries.
4664 // Process them only once.
4665 if (index != lastInsIndex)
4666 {
4667 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4668 // FIXME: For now, all byval parameter objects are marked mutable.
4669 // This can be changed with more analysis.
4670 // In case of tail call optimization mark all arguments mutable.
4671 // Since they could be overwritten by lowering of arguments in case of
4672 // a tail call.
4673 if (Flags.isByVal()) {
4674 assert(Ins[index].isOrigArg() &&
4675 "Byval arguments cannot be implicit");
4676 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4677
4678 int FrameIndex = StoreByValRegs(
4679 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4680 VA.getLocMemOffset(), Flags.getByValSize());
4681 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4682 CCInfo.nextInRegsParam();
4683 } else {
4684 unsigned FIOffset = VA.getLocMemOffset();
4685 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4686 FIOffset, true);
4687
4688 // Create load nodes to retrieve arguments from the stack.
4689 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4690 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4692 DAG.getMachineFunction(), FI)));
4693 }
4694 lastInsIndex = index;
4695 }
4696 }
4697 }
4698
4699 // varargs
4700 if (isVarArg && MFI.hasVAStart()) {
4701 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4702 TotalArgRegsSaveSize);
4703 if (AFI->isCmseNSEntryFunction()) {
4706 "secure entry function must not be variadic", dl.getDebugLoc());
4707 DAG.getContext()->diagnose(Diag);
4708 }
4709 }
4710
4711 unsigned StackArgSize = CCInfo.getStackSize();
4712 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4713 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4714 // The only way to guarantee a tail call is if the callee restores its
4715 // argument area, but it must also keep the stack aligned when doing so.
4716 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4717 assert(StackAlign && "data layout string is missing stack alignment");
4718 StackArgSize = alignTo(StackArgSize, *StackAlign);
4719
4720 AFI->setArgumentStackToRestore(StackArgSize);
4721 }
4722 AFI->setArgumentStackSize(StackArgSize);
4723
4724 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4727 "secure entry function requires arguments on stack", dl.getDebugLoc());
4728 DAG.getContext()->diagnose(Diag);
4729 }
4730
4731 return Chain;
4732}
4733
4734/// isFloatingPointZero - Return true if this is +0.0.
4737 return CFP->getValueAPF().isPosZero();
4738 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4739 // Maybe this has already been legalized into the constant pool?
4740 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4741 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4743 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4744 return CFP->getValueAPF().isPosZero();
4745 }
4746 } else if (Op->getOpcode() == ISD::BITCAST &&
4747 Op->getValueType(0) == MVT::f64) {
4748 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4749 // created by LowerConstantFP().
4750 SDValue BitcastOp = Op->getOperand(0);
4751 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4752 isNullConstant(BitcastOp->getOperand(0)))
4753 return true;
4754 }
4755 return false;
4756}
4757
4758/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
4759/// the given operands.
4760SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4761 SDValue &ARMcc, SelectionDAG &DAG,
4762 const SDLoc &dl) const {
4763 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4764 unsigned C = RHSC->getZExtValue();
4765 if (!isLegalICmpImmediate((int32_t)C)) {
4766 // Constant does not fit, try adjusting it by one.
4767 switch (CC) {
4768 default: break;
4769 case ISD::SETLT:
4770 case ISD::SETGE:
4771 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4772 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4773 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4774 }
4775 break;
4776 case ISD::SETULT:
4777 case ISD::SETUGE:
4778 if (C != 0 && isLegalICmpImmediate(C-1)) {
4779 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4780 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4781 }
4782 break;
4783 case ISD::SETLE:
4784 case ISD::SETGT:
4785 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4786 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4787 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4788 }
4789 break;
4790 case ISD::SETULE:
4791 case ISD::SETUGT:
4792 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4793 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4794 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4795 }
4796 break;
4797 }
4798 }
4799 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4800 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
4801 // In ARM and Thumb-2, the compare instructions can shift their second
4802 // operand.
4804 std::swap(LHS, RHS);
4805 }
4806
4807 // Thumb1 has very limited immediate modes, so turning an "and" into a
4808 // shift can save multiple instructions.
4809 //
4810 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4811 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4812 // own. If it's the operand to an unsigned comparison with an immediate,
4813 // we can eliminate one of the shifts: we transform
4814 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4815 //
4816 // We avoid transforming cases which aren't profitable due to encoding
4817 // details:
4818 //
4819 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4820 // would not; in that case, we're essentially trading one immediate load for
4821 // another.
4822 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4823 // 3. C2 is zero; we have other code for this special case.
4824 //
4825 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4826 // instruction, since the AND is always one instruction anyway, but we could
4827 // use narrow instructions in some cases.
4828 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4829 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4830 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4831 !isSignedIntSetCC(CC)) {
4832 unsigned Mask = LHS.getConstantOperandVal(1);
4833 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4834 uint64_t RHSV = RHSC->getZExtValue();
4835 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4836 unsigned ShiftBits = llvm::countl_zero(Mask);
4837 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4838 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4839 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4840 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4841 }
4842 }
4843 }
4844
4845 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4846 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4847 // way a cmp would.
4848 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4849 // some tweaks to the heuristics for the previous and->shift transform.
4850 // FIXME: Optimize cases where the LHS isn't a shift.
4851 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4852 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4853 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4854 LHS.getConstantOperandVal(1) < 31) {
4855 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4856 SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
4857 DAG.getVTList(MVT::i32, MVT::i32),
4858 LHS.getOperand(0),
4859 DAG.getConstant(ShiftAmt, dl, MVT::i32));
4860 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
4861 Shift.getValue(1), SDValue());
4862 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4863 return Chain.getValue(1);
4864 }
4865
4866 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4867
4868 // If the RHS is a constant zero then the V (overflow) flag will never be
4869 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4870 // simpler for other passes (like the peephole optimiser) to deal with.
4871 if (isNullConstant(RHS)) {
4872 switch (CondCode) {
4873 default: break;
4874 case ARMCC::GE:
4875 CondCode = ARMCC::PL;
4876 break;
4877 case ARMCC::LT:
4878 CondCode = ARMCC::MI;
4879 break;
4880 }
4881 }
4882
4883 ARMISD::NodeType CompareType;
4884 switch (CondCode) {
4885 default:
4886 CompareType = ARMISD::CMP;
4887 break;
4888 case ARMCC::EQ:
4889 case ARMCC::NE:
4890 // Uses only Z Flag
4891 CompareType = ARMISD::CMPZ;
4892 break;
4893 }
4894 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4895 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
4896}
4897
4898/// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4899SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4900 SelectionDAG &DAG, const SDLoc &dl,
4901 bool Signaling) const {
4902 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4903 SDValue Cmp;
4904 if (!isFloatingPointZero(RHS))
4905 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP,
4906 dl, MVT::Glue, LHS, RHS);
4907 else
4908 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0,
4909 dl, MVT::Glue, LHS);
4910 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
4911}
4912
4913/// duplicateCmp - Glue values can have only one use, so this function
4914/// duplicates a comparison node.
4915SDValue
4916ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
4917 unsigned Opc = Cmp.getOpcode();
4918 SDLoc DL(Cmp);
4919 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
4920 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4921
4922 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
4923 Cmp = Cmp.getOperand(0);
4924 Opc = Cmp.getOpcode();
4925 if (Opc == ARMISD::CMPFP)
4926 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4927 else {
4928 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
4929 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
4930 }
4931 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
4932}
4933
4934// This function returns three things: the arithmetic computation itself
4935// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4936// comparison and the condition code define the case in which the arithmetic
4937// computation *does not* overflow.
4938std::pair<SDValue, SDValue>
4939ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4940 SDValue &ARMcc) const {
4941 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4942
4943 SDValue Value, OverflowCmp;
4944 SDValue LHS = Op.getOperand(0);
4945 SDValue RHS = Op.getOperand(1);
4946 SDLoc dl(Op);
4947
4948 // FIXME: We are currently always generating CMPs because we don't support
4949 // generating CMN through the backend. This is not as good as the natural
4950 // CMP case because it causes a register dependency and cannot be folded
4951 // later.
4952
4953 switch (Op.getOpcode()) {
4954 default:
4955 llvm_unreachable("Unknown overflow instruction!");
4956 case ISD::SADDO:
4957 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4958 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4959 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4960 break;
4961 case ISD::UADDO:
4962 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4963 // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4964 // We do not use it in the USUBO case as Value may not be used.
4965 Value = DAG.getNode(ARMISD::ADDC, dl,
4966 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4967 .getValue(0);
4968 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4969 break;
4970 case ISD::SSUBO:
4971 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4972 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4973 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4974 break;
4975 case ISD::USUBO:
4976 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4977 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4978 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4979 break;
4980 case ISD::UMULO:
4981 // We generate a UMUL_LOHI and then check if the high word is 0.
4982 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4983 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4984 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4985 LHS, RHS);
4986 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4987 DAG.getConstant(0, dl, MVT::i32));
4988 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4989 break;
4990 case ISD::SMULO:
4991 // We generate a SMUL_LOHI and then check if all the bits of the high word
4992 // are the same as the sign bit of the low word.
4993 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4994 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4995 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4996 LHS, RHS);
4997 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4998 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4999 Value.getValue(0),
5000 DAG.getConstant(31, dl, MVT::i32)));
5001 Value = Value.getValue(0); // We only want the low 32 bits for the result.
5002 break;
5003 } // switch (...)
5004
5005 return std::make_pair(Value, OverflowCmp);
5006}
5007
5008SDValue
5009ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
5010 // Let legalize expand this if it isn't a legal type yet.
5011 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
5012 return SDValue();
5013
5014 SDValue Value, OverflowCmp;
5015 SDValue ARMcc;
5016 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
5017 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5018 SDLoc dl(Op);
5019 // We use 0 and 1 as false and true values.
5020 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
5021 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
5022 EVT VT = Op.getValueType();
5023
5024 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
5025 ARMcc, CCR, OverflowCmp);
5026
5027 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
5028 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
5029}
5030
5032 SelectionDAG &DAG) {
5033 SDLoc DL(BoolCarry);
5034 EVT CarryVT = BoolCarry.getValueType();
5035
5036 // This converts the boolean value carry into the carry flag by doing
5037 // ARMISD::SUBC Carry, 1
5038 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
5039 DAG.getVTList(CarryVT, MVT::i32),
5040 BoolCarry, DAG.getConstant(1, DL, CarryVT));
5041 return Carry.getValue(1);
5042}
5043
5045 SelectionDAG &DAG) {
5046 SDLoc DL(Flags);
5047
5048 // Now convert the carry flag into a boolean carry. We do this
5049 // using ARMISD:ADDE 0, 0, Carry
5050 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
5051 DAG.getConstant(0, DL, MVT::i32),
5052 DAG.getConstant(0, DL, MVT::i32), Flags);
5053}
5054
5055SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
5056 SelectionDAG &DAG) const {
5057 // Let legalize expand this if it isn't a legal type yet.
5058 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
5059 return SDValue();
5060
5061 SDValue LHS = Op.getOperand(0);
5062 SDValue RHS = Op.getOperand(1);
5063 SDLoc dl(Op);
5064
5065 EVT VT = Op.getValueType();
5066 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
5067 SDValue Value;
5068 SDValue Overflow;
5069 switch (Op.getOpcode()) {
5070 default:
5071 llvm_unreachable("Unknown overflow instruction!");
5072 case ISD::UADDO:
5073 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
5074 // Convert the carry flag into a boolean value.
5075 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
5076 break;
5077 case ISD::USUBO: {
5078 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
5079 // Convert the carry flag into a boolean value.
5080 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
5081 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
5082 // value. So compute 1 - C.
5083 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
5084 DAG.getConstant(1, dl, MVT::i32), Overflow);
5085 break;
5086 }
5087 }
5088
5089 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
5090}
5091
5093 const ARMSubtarget *Subtarget) {
5094 EVT VT = Op.getValueType();
5095 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
5096 return SDValue();
5097 if (!VT.isSimple())
5098 return SDValue();
5099
5100 unsigned NewOpcode;
5101 switch (VT.getSimpleVT().SimpleTy) {
5102 default:
5103 return SDValue();
5104 case MVT::i8:
5105 switch (Op->getOpcode()) {
5106 case ISD::UADDSAT:
5107 NewOpcode = ARMISD::UQADD8b;
5108 break;
5109 case ISD::SADDSAT:
5110 NewOpcode = ARMISD::QADD8b;
5111 break;
5112 case ISD::USUBSAT:
5113 NewOpcode = ARMISD::UQSUB8b;
5114 break;
5115 case ISD::SSUBSAT:
5116 NewOpcode = ARMISD::QSUB8b;
5117 break;
5118 }
5119 break;
5120 case MVT::i16:
5121 switch (Op->getOpcode()) {
5122 case ISD::UADDSAT:
5123 NewOpcode = ARMISD::UQADD16b;
5124 break;
5125 case ISD::SADDSAT:
5126 NewOpcode = ARMISD::QADD16b;
5127 break;
5128 case ISD::USUBSAT:
5129 NewOpcode = ARMISD::UQSUB16b;
5130 break;
5131 case ISD::SSUBSAT:
5132 NewOpcode = ARMISD::QSUB16b;
5133 break;
5134 }
5135 break;
5136 }
5137
5138 SDLoc dl(Op);
5139 SDValue Add =
5140 DAG.getNode(NewOpcode, dl, MVT::i32,
5141 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
5142 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
5143 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
5144}
5145
5146SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
5147 SDValue Cond = Op.getOperand(0);
5148 SDValue SelectTrue = Op.getOperand(1);
5149 SDValue SelectFalse = Op.getOperand(2);
5150 SDLoc dl(Op);
5151 unsigned Opc = Cond.getOpcode();
5152
5153 if (Cond.getResNo() == 1 &&
5154 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5155 Opc == ISD::USUBO)) {
5156 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
5157 return SDValue();
5158
5159 SDValue Value, OverflowCmp;
5160 SDValue ARMcc;
5161 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5162 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5163 EVT VT = Op.getValueType();
5164
5165 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
5166 OverflowCmp, DAG);
5167 }
5168
5169 // Convert:
5170 //
5171 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
5172 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
5173 //
5174 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
5175 const ConstantSDNode *CMOVTrue =
5176 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
5177 const ConstantSDNode *CMOVFalse =
5178 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
5179
5180 if (CMOVTrue && CMOVFalse) {
5181 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
5182 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
5183
5184 SDValue True;
5185 SDValue False;
5186 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
5187 True = SelectTrue;
5188 False = SelectFalse;
5189 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
5190 True = SelectFalse;
5191 False = SelectTrue;
5192 }
5193
5194 if (True.getNode() && False.getNode()) {
5195 EVT VT = Op.getValueType();
5196 SDValue ARMcc = Cond.getOperand(2);
5197 SDValue CCR = Cond.getOperand(3);
5198 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
5199 assert(True.getValueType() == VT);
5200 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
5201 }
5202 }
5203 }
5204
5205 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
5206 // undefined bits before doing a full-word comparison with zero.
5207 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
5208 DAG.getConstant(1, dl, Cond.getValueType()));
5209
5210 return DAG.getSelectCC(dl, Cond,
5211 DAG.getConstant(0, dl, Cond.getValueType()),
5212 SelectTrue, SelectFalse, ISD::SETNE);
5213}
5214
5216 bool &swpCmpOps, bool &swpVselOps) {
5217 // Start by selecting the GE condition code for opcodes that return true for
5218 // 'equality'
5219 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
5220 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
5221 CondCode = ARMCC::GE;
5222
5223 // and GT for opcodes that return false for 'equality'.
5224 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
5225 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
5226 CondCode = ARMCC::GT;
5227
5228 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
5229 // to swap the compare operands.
5230 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
5231 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
5232 swpCmpOps = true;
5233
5234 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
5235 // If we have an unordered opcode, we need to swap the operands to the VSEL
5236 // instruction (effectively negating the condition).
5237 //
5238 // This also has the effect of swapping which one of 'less' or 'greater'
5239 // returns true, so we also swap the compare operands. It also switches
5240 // whether we return true for 'equality', so we compensate by picking the
5241 // opposite condition code to our original choice.
5242 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
5243 CC == ISD::SETUGT) {
5244 swpCmpOps = !swpCmpOps;
5245 swpVselOps = !swpVselOps;
5246 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
5247 }
5248
5249 // 'ordered' is 'anything but unordered', so use the VS condition code and
5250 // swap the VSEL operands.
5251 if (CC == ISD::SETO) {
5252 CondCode = ARMCC::VS;
5253 swpVselOps = true;
5254 }
5255
5256 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
5257 // code and swap the VSEL operands. Also do this if we don't care about the
5258 // unordered case.
5259 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
5260 CondCode = ARMCC::EQ;
5261 swpVselOps = true;
5262 }
5263}
5264
5265SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
5266 SDValue TrueVal, SDValue ARMcc, SDValue CCR,
5267 SDValue Cmp, SelectionDAG &DAG) const {
5268 if (!Subtarget->hasFP64() && VT == MVT::f64) {
5270 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
5272 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
5273
5274 SDValue TrueLow = TrueVal.getValue(0);
5275 SDValue TrueHigh = TrueVal.getValue(1);
5276 SDValue FalseLow = FalseVal.getValue(0);
5277 SDValue FalseHigh = FalseVal.getValue(1);
5278
5279 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
5280 ARMcc, CCR, Cmp);
5281 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
5282 ARMcc, CCR, duplicateCmp(Cmp, DAG));
5283
5284 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
5285 } else {
5286 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
5287 Cmp);
5288 }
5289}
5290
5292 return CC == ISD::SETGT || CC == ISD::SETGE;
5293}
5294
5296 return CC == ISD::SETLT || CC == ISD::SETLE;
5297}
5298
5299// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5300// All of these conditions (and their <= and >= counterparts) will do:
5301// x < k ? k : x
5302// x > k ? x : k
5303// k < x ? x : k
5304// k > x ? k : x
5305static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5306 const SDValue TrueVal, const SDValue FalseVal,
5307 const ISD::CondCode CC, const SDValue K) {
5308 return (isGTorGE(CC) &&
5309 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5310 (isLTorLE(CC) &&
5311 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5312}
5313
5314// Check if two chained conditionals could be converted into SSAT or USAT.
5315//
5316// SSAT can replace a set of two conditional selectors that bound a number to an
5317// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
5318//
5319// x < -k ? -k : (x > k ? k : x)
5320// x < -k ? -k : (x < k ? x : k)
5321// x > -k ? (x > k ? k : x) : -k
5322// x < k ? (x < -k ? -k : x) : k
5323// etc.
5324//
5325// LLVM canonicalizes these to either a min(max()) or a max(min())
5326// pattern. This function tries to match one of these and will return a SSAT
5327// node if successful.
5328//
5329// USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1
5330// is a power of 2.
5332 EVT VT = Op.getValueType();
5333 SDValue V1 = Op.getOperand(0);
5334 SDValue K1 = Op.getOperand(1);
5335 SDValue TrueVal1 = Op.getOperand(2);
5336 SDValue FalseVal1 = Op.getOperand(3);
5337 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5338
5339 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5340 if (Op2.getOpcode() != ISD::SELECT_CC)
5341 return SDValue();
5342
5343 SDValue V2 = Op2.getOperand(0);
5344 SDValue K2 = Op2.getOperand(1);
5345 SDValue TrueVal2 = Op2.getOperand(2);
5346 SDValue FalseVal2 = Op2.getOperand(3);
5347 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5348
5349 SDValue V1Tmp = V1;
5350 SDValue V2Tmp = V2;
5351
5352 // Check that the registers and the constants match a max(min()) or min(max())
5353 // pattern
5354 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5355 K2 != FalseVal2 ||
5356 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5357 return SDValue();
5358
5359 // Check that the constant in the lower-bound check is
5360 // the opposite of the constant in the upper-bound check
5361 // in 1's complement.
5363 return SDValue();
5364
5365 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5366 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5367 int64_t PosVal = std::max(Val1, Val2);
5368 int64_t NegVal = std::min(Val1, Val2);
5369
5370 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5371 !isPowerOf2_64(PosVal + 1))
5372 return SDValue();
5373
5374 // Handle the difference between USAT (unsigned) and SSAT (signed)
5375 // saturation
5376 // At this point, PosVal is guaranteed to be positive
5377 uint64_t K = PosVal;
5378 SDLoc dl(Op);
5379 if (Val1 == ~Val2)
5380 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5381 DAG.getConstant(llvm::countr_one(K), dl, VT));
5382 if (NegVal == 0)
5383 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5384 DAG.getConstant(llvm::countr_one(K), dl, VT));
5385
5386 return SDValue();
5387}
5388
5389// Check if a condition of the type x < k ? k : x can be converted into a
5390// bit operation instead of conditional moves.
5391// Currently this is allowed given:
5392// - The conditions and values match up
5393// - k is 0 or -1 (all ones)
5394// This function will not check the last condition, thats up to the caller
5395// It returns true if the transformation can be made, and in such case
5396// returns x in V, and k in SatK.
5398 SDValue &SatK)
5399{
5400 SDValue LHS = Op.getOperand(0);
5401 SDValue RHS = Op.getOperand(1);
5402 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5403 SDValue TrueVal = Op.getOperand(2);
5404 SDValue FalseVal = Op.getOperand(3);
5405
5407 ? &RHS
5408 : nullptr;
5409
5410 // No constant operation in comparison, early out
5411 if (!K)
5412 return false;
5413
5414 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5415 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5416 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5417
5418 // If the constant on left and right side, or variable on left and right,
5419 // does not match, early out
5420 if (*K != KTmp || V != VTmp)
5421 return false;
5422
5423 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5424 SatK = *K;
5425 return true;
5426 }
5427
5428 return false;
5429}
5430
5431bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5432 if (VT == MVT::f32)
5433 return !Subtarget->hasVFP2Base();
5434 if (VT == MVT::f64)
5435 return !Subtarget->hasFP64();
5436 if (VT == MVT::f16)
5437 return !Subtarget->hasFullFP16();
5438 return false;
5439}
5440
5441SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5442 EVT VT = Op.getValueType();
5443 SDLoc dl(Op);
5444
5445 // Try to convert two saturating conditional selects into a single SSAT
5446 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5447 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5448 return SatValue;
5449
5450 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5451 // into more efficient bit operations, which is possible when k is 0 or -1
5452 // On ARM and Thumb-2 which have flexible operand 2 this will result in
5453 // single instructions. On Thumb the shift and the bit operation will be two
5454 // instructions.
5455 // Only allow this transformation on full-width (32-bit) operations
5456 SDValue LowerSatConstant;
5457 SDValue SatValue;
5458 if (VT == MVT::i32 &&
5459 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5460 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5461 DAG.getConstant(31, dl, VT));
5462 if (isNullConstant(LowerSatConstant)) {
5463 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5464 DAG.getAllOnesConstant(dl, VT));
5465 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5466 } else if (isAllOnesConstant(LowerSatConstant))
5467 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5468 }
5469
5470 SDValue LHS = Op.getOperand(0);
5471 SDValue RHS = Op.getOperand(1);
5472 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5473 SDValue TrueVal = Op.getOperand(2);
5474 SDValue FalseVal = Op.getOperand(3);
5475 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5476 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5477
5478 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5479 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5480 unsigned TVal = CTVal->getZExtValue();
5481 unsigned FVal = CFVal->getZExtValue();
5482 unsigned Opcode = 0;
5483
5484 if (TVal == ~FVal) {
5485 Opcode = ARMISD::CSINV;
5486 } else if (TVal == ~FVal + 1) {
5487 Opcode = ARMISD::CSNEG;
5488 } else if (TVal + 1 == FVal) {
5489 Opcode = ARMISD::CSINC;
5490 } else if (TVal == FVal + 1) {
5491 Opcode = ARMISD::CSINC;
5492 std::swap(TrueVal, FalseVal);
5493 std::swap(TVal, FVal);
5494 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5495 }
5496
5497 if (Opcode) {
5498 // If one of the constants is cheaper than another, materialise the
5499 // cheaper one and let the csel generate the other.
5500 if (Opcode != ARMISD::CSINC &&
5501 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5502 std::swap(TrueVal, FalseVal);
5503 std::swap(TVal, FVal);
5504 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5505 }
5506
5507 // Attempt to use ZR checking TVal is 0, possibly inverting the condition
5508 // to get there. CSINC not is invertable like the other two (~(~a) == a,
5509 // -(-a) == a, but (a+1)+1 != a).
5510 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5511 std::swap(TrueVal, FalseVal);
5512 std::swap(TVal, FVal);
5513 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5514 }
5515
5516 // Drops F's value because we can get it by inverting/negating TVal.
5517 FalseVal = TrueVal;
5518
5519 SDValue ARMcc;
5520 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5521 EVT VT = TrueVal.getValueType();
5522 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5523 }
5524 }
5525
5526 if (isUnsupportedFloatingType(LHS.getValueType())) {
5528 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5529
5530 // If softenSetCCOperands only returned one value, we should compare it to
5531 // zero.
5532 if (!RHS.getNode()) {
5533 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5534 CC = ISD::SETNE;
5535 }
5536 }
5537
5538 if (LHS.getValueType() == MVT::i32) {
5539 // Try to generate VSEL on ARMv8.
5540 // The VSEL instruction can't use all the usual ARM condition
5541 // codes: it only has two bits to select the condition code, so it's
5542 // constrained to use only GE, GT, VS and EQ.
5543 //
5544 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5545 // swap the operands of the previous compare instruction (effectively
5546 // inverting the compare condition, swapping 'less' and 'greater') and
5547 // sometimes need to swap the operands to the VSEL (which inverts the
5548 // condition in the sense of firing whenever the previous condition didn't)
5549 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5550 TrueVal.getValueType() == MVT::f32 ||
5551 TrueVal.getValueType() == MVT::f64)) {
5552 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5553 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5554 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5555 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5556 std::swap(TrueVal, FalseVal);
5557 }
5558 }
5559
5560 SDValue ARMcc;
5561 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5562 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5563 // Choose GE over PL, which vsel does now support
5564 if (ARMcc->getAsZExtVal() == ARMCC::PL)
5565 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5566 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5567 }
5568
5569 ARMCC::CondCodes CondCode, CondCode2;
5570 FPCCToARMCC(CC, CondCode, CondCode2);
5571
5572 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5573 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5574 // must use VSEL (limited condition codes), due to not having conditional f16
5575 // moves.
5576 if (Subtarget->hasFPARMv8Base() &&
5577 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5578 (TrueVal.getValueType() == MVT::f16 ||
5579 TrueVal.getValueType() == MVT::f32 ||
5580 TrueVal.getValueType() == MVT::f64)) {
5581 bool swpCmpOps = false;
5582 bool swpVselOps = false;
5583 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5584
5585 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5586 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5587 if (swpCmpOps)
5588 std::swap(LHS, RHS);
5589 if (swpVselOps)
5590 std::swap(TrueVal, FalseVal);
5591 }
5592 }
5593
5594 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5595 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5596 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5597 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5598 if (CondCode2 != ARMCC::AL) {
5599 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5600 // FIXME: Needs another CMP because flag can have but one use.
5601 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
5602 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
5603 }
5604 return Result;
5605}
5606
5607/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5608/// to morph to an integer compare sequence.
5609static bool canChangeToInt(SDValue Op, bool &SeenZero,
5610 const ARMSubtarget *Subtarget) {
5611 SDNode *N = Op.getNode();
5612 if (!N->hasOneUse())
5613 // Otherwise it requires moving the value from fp to integer registers.
5614 return false;
5615 if (!N->getNumValues())
5616 return false;
5617 EVT VT = Op.getValueType();
5618 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5619 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5620 // vmrs are very slow, e.g. cortex-a8.
5621 return false;
5622
5623 if (isFloatingPointZero(Op)) {
5624 SeenZero = true;
5625 return true;
5626 }
5627 return ISD::isNormalLoad(N);
5628}
5629
5632 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5633
5635 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5636 Ld->getPointerInfo(), Ld->getAlign(),
5637 Ld->getMemOperand()->getFlags());
5638
5639 llvm_unreachable("Unknown VFP cmp argument!");
5640}
5641
5643 SDValue &RetVal1, SDValue &RetVal2) {
5644 SDLoc dl(Op);
5645
5646 if (isFloatingPointZero(Op)) {
5647 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5648 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5649 return;
5650 }
5651
5652 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5653 SDValue Ptr = Ld->getBasePtr();
5654 RetVal1 =
5655 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5656 Ld->getAlign(), Ld->getMemOperand()->getFlags());
5657
5658 EVT PtrType = Ptr.getValueType();
5659 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5660 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5661 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5662 Ld->getPointerInfo().getWithOffset(4),
5663 commonAlignment(Ld->getAlign(), 4),
5664 Ld->getMemOperand()->getFlags());
5665 return;
5666 }
5667
5668 llvm_unreachable("Unknown VFP cmp argument!");
5669}
5670
5671/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
5672/// f32 and even f64 comparisons to integer ones.
5673SDValue
5674ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5675 SDValue Chain = Op.getOperand(0);
5676 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5677 SDValue LHS = Op.getOperand(2);
5678 SDValue RHS = Op.getOperand(3);
5679 SDValue Dest = Op.getOperand(4);
5680 SDLoc dl(Op);
5681
5682 bool LHSSeenZero = false;
5683 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5684 bool RHSSeenZero = false;
5685 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5686 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5687 // If unsafe fp math optimization is enabled and there are no other uses of
5688 // the CMP operands, and the condition code is EQ or NE, we can optimize it
5689 // to an integer comparison.
5690 if (CC == ISD::SETOEQ)
5691 CC = ISD::SETEQ;
5692 else if (CC == ISD::SETUNE)
5693 CC = ISD::SETNE;
5694
5695 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5696 SDValue ARMcc;
5697 if (LHS.getValueType() == MVT::f32) {
5698 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5699 bitcastf32Toi32(LHS, DAG), Mask);
5700 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5701 bitcastf32Toi32(RHS, DAG), Mask);
5702 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5703 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5704 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5705 Chain, Dest, ARMcc, CCR, Cmp);
5706 }
5707
5708 SDValue LHS1, LHS2;
5709 SDValue RHS1, RHS2;
5710 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5711 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5712 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5713 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5714 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5715 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5716 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5717 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5718 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
5719 }
5720
5721 return SDValue();
5722}
5723
5724SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5725 SDValue Chain = Op.getOperand(0);
5726 SDValue Cond = Op.getOperand(1);
5727 SDValue Dest = Op.getOperand(2);
5728 SDLoc dl(Op);
5729
5730 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5731 // instruction.
5732 unsigned Opc = Cond.getOpcode();
5733 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5734 !Subtarget->isThumb1Only();
5735 if (Cond.getResNo() == 1 &&
5736 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5737 Opc == ISD::USUBO || OptimizeMul)) {
5738 // Only lower legal XALUO ops.
5739 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
5740 return SDValue();
5741
5742 // The actual operation with overflow check.
5743 SDValue Value, OverflowCmp;
5744 SDValue ARMcc;
5745 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5746
5747 // Reverse the condition code.
5748 ARMCC::CondCodes CondCode =
5749 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5750 CondCode = ARMCC::getOppositeCondition(CondCode);
5751 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5752 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5753
5754 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5755 OverflowCmp);
5756 }
5757
5758 return SDValue();
5759}
5760
5761SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5762 SDValue Chain = Op.getOperand(0);
5763 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5764 SDValue LHS = Op.getOperand(2);
5765 SDValue RHS = Op.getOperand(3);
5766 SDValue Dest = Op.getOperand(4);
5767 SDLoc dl(Op);
5768
5769 if (isUnsupportedFloatingType(LHS.getValueType())) {
5771 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5772
5773 // If softenSetCCOperands only returned one value, we should compare it to
5774 // zero.
5775 if (!RHS.getNode()) {
5776 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5777 CC = ISD::SETNE;
5778 }
5779 }
5780
5781 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5782 // instruction.
5783 unsigned Opc = LHS.getOpcode();
5784 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5785 !Subtarget->isThumb1Only();
5786 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5787 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5788 Opc == ISD::USUBO || OptimizeMul) &&
5789 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5790 // Only lower legal XALUO ops.
5791 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
5792 return SDValue();
5793
5794 // The actual operation with overflow check.
5795 SDValue Value, OverflowCmp;
5796 SDValue ARMcc;
5797 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5798
5799 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5800 // Reverse the condition code.
5801 ARMCC::CondCodes CondCode =
5802 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5803 CondCode = ARMCC::getOppositeCondition(CondCode);
5804 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5805 }
5806 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5807
5808 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5809 OverflowCmp);
5810 }
5811
5812 if (LHS.getValueType() == MVT::i32) {
5813 SDValue ARMcc;
5814 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5815 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5816 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5817 Chain, Dest, ARMcc, CCR, Cmp);
5818 }
5819
5820 if (getTargetMachine().Options.UnsafeFPMath &&
5821 (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
5822 CC == ISD::SETNE || CC == ISD::SETUNE)) {
5823 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5824 return Result;
5825 }
5826
5827 ARMCC::CondCodes CondCode, CondCode2;
5828 FPCCToARMCC(CC, CondCode, CondCode2);
5829
5830 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5831 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5832 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5833 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5834 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
5835 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
5836 if (CondCode2 != ARMCC::AL) {
5837 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5838 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
5839 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
5840 }
5841 return Res;
5842}
5843
5844SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5845 SDValue Chain = Op.getOperand(0);
5846 SDValue Table = Op.getOperand(1);
5847 SDValue Index = Op.getOperand(2);
5848 SDLoc dl(Op);
5849
5850 EVT PTy = getPointerTy(DAG.getDataLayout());
5852 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5853 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5854 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5855 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5856 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5857 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
5858 // which does another jump to the destination. This also makes it easier
5859 // to translate it to TBB / TBH later (Thumb2 only).
5860 // FIXME: This might not work if the function is extremely large.
5861 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5862 Addr, Op.getOperand(2), JTI);
5863 }
5864 if (isPositionIndependent() || Subtarget->isROPI()) {
5865 Addr =
5866 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5868 Chain = Addr.getValue(1);
5869 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5870 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5871 } else {
5872 Addr =
5873 DAG.getLoad(PTy, dl, Chain, Addr,
5875 Chain = Addr.getValue(1);
5876 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5877 }
5878}
5879
5881 EVT VT = Op.getValueType();
5882 SDLoc dl(Op);
5883
5884 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5885 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5886 return Op;
5887 return DAG.UnrollVectorOp(Op.getNode());
5888 }
5889
5890 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5891
5892 EVT NewTy;
5893 const EVT OpTy = Op.getOperand(0).getValueType();
5894 if (OpTy == MVT::v4f32)
5895 NewTy = MVT::v4i32;
5896 else if (OpTy == MVT::v4f16 && HasFullFP16)
5897 NewTy = MVT::v4i16;
5898 else if (OpTy == MVT::v8f16 && HasFullFP16)
5899 NewTy = MVT::v8i16;
5900 else
5901 llvm_unreachable("Invalid type for custom lowering!");
5902
5903 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5904 return DAG.UnrollVectorOp(Op.getNode());
5905
5906 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5907 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5908}
5909
5910SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5911 EVT VT = Op.getValueType();
5912 if (VT.isVector())
5913 return LowerVectorFP_TO_INT(Op, DAG);
5914
5915 bool IsStrict = Op->isStrictFPOpcode();
5916 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5917
5918 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5919 RTLIB::Libcall LC;
5920 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5921 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5922 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5923 Op.getValueType());
5924 else
5925 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5926 Op.getValueType());
5927 SDLoc Loc(Op);
5928 MakeLibCallOptions CallOptions;
5929 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5931 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5932 CallOptions, Loc, Chain);
5933 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5934 }
5935
5936 // FIXME: Remove this when we have strict fp instruction selection patterns
5937 if (IsStrict) {
5938 SDLoc Loc(Op);
5939 SDValue Result =
5942 Loc, Op.getValueType(), SrcVal);
5943 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5944 }
5945
5946 return Op;
5947}
5948
5950 const ARMSubtarget *Subtarget) {
5951 EVT VT = Op.getValueType();
5952 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5953 EVT FromVT = Op.getOperand(0).getValueType();
5954
5955 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
5956 return Op;
5957 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
5958 Subtarget->hasFP64())
5959 return Op;
5960 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
5961 Subtarget->hasFullFP16())
5962 return Op;
5963 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
5964 Subtarget->hasMVEFloatOps())
5965 return Op;
5966 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
5967 Subtarget->hasMVEFloatOps())
5968 return Op;
5969
5970 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
5971 return SDValue();
5972
5973 SDLoc DL(Op);
5974 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
5975 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
5976 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
5977 DAG.getValueType(VT.getScalarType()));
5978 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
5979 DAG.getConstant((1 << BW) - 1, DL, VT));
5980 if (IsSigned)
5981 Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
5982 DAG.getConstant(-(1 << BW), DL, VT));
5983 return Max;
5984}
5985
5987 EVT VT = Op.getValueType();
5988 SDLoc dl(Op);
5989
5990 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5991 if (VT.getVectorElementType() == MVT::f32)
5992 return Op;
5993 return DAG.UnrollVectorOp(Op.getNode());
5994 }
5995
5996 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5997 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5998 "Invalid type for custom lowering!");
5999
6000 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
6001
6002 EVT DestVecType;
6003 if (VT == MVT::v4f32)
6004 DestVecType = MVT::v4i32;
6005 else if (VT == MVT::v4f16 && HasFullFP16)
6006 DestVecType = MVT::v4i16;
6007 else if (VT == MVT::v8f16 && HasFullFP16)
6008 DestVecType = MVT::v8i16;
6009 else
6010 return DAG.UnrollVectorOp(Op.getNode());
6011
6012 unsigned CastOpc;
6013 unsigned Opc;
6014 switch (Op.getOpcode()) {
6015 default: llvm_unreachable("Invalid opcode!");
6016 case ISD::SINT_TO_FP:
6017 CastOpc = ISD::SIGN_EXTEND;
6018 Opc = ISD::SINT_TO_FP;
6019 break;
6020 case ISD::UINT_TO_FP:
6021 CastOpc = ISD::ZERO_EXTEND;
6022 Opc = ISD::UINT_TO_FP;
6023 break;
6024 }
6025
6026 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
6027 return DAG.getNode(Opc, dl, VT, Op);
6028}
6029
6030SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
6031 EVT VT = Op.getValueType();
6032 if (VT.isVector())
6033 return LowerVectorINT_TO_FP(Op, DAG);
6034 if (isUnsupportedFloatingType(VT)) {
6035 RTLIB::Libcall LC;
6036 if (Op.getOpcode() == ISD::SINT_TO_FP)
6037 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
6038 Op.getValueType());
6039 else
6040 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
6041 Op.getValueType());
6042 MakeLibCallOptions CallOptions;
6043 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
6044 CallOptions, SDLoc(Op)).first;
6045 }
6046
6047 return Op;
6048}
6049
6050SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
6051 // Implement fcopysign with a fabs and a conditional fneg.
6052 SDValue Tmp0 = Op.getOperand(0);
6053 SDValue Tmp1 = Op.getOperand(1);
6054 SDLoc dl(Op);
6055 EVT VT = Op.getValueType();
6056 EVT SrcVT = Tmp1.getValueType();
6057 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
6058 Tmp0.getOpcode() == ARMISD::VMOVDRR;
6059 bool UseNEON = !InGPR && Subtarget->hasNEON();
6060
6061 if (UseNEON) {
6062 // Use VBSL to copy the sign bit.
6063 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
6064 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
6065 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
6066 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
6067 if (VT == MVT::f64)
6068 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
6069 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
6070 DAG.getConstant(32, dl, MVT::i32));
6071 else /*if (VT == MVT::f32)*/
6072 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
6073 if (SrcVT == MVT::f32) {
6074 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
6075 if (VT == MVT::f64)
6076 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
6077 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
6078 DAG.getConstant(32, dl, MVT::i32));
6079 } else if (VT == MVT::f32)
6080 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
6081 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
6082 DAG.getConstant(32, dl, MVT::i32));
6083 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
6084 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
6085
6087 dl, MVT::i32);
6088 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
6089 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
6090 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
6091
6092 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
6093 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
6094 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
6095 if (VT == MVT::f32) {
6096 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
6097 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
6098 DAG.getConstant(0, dl, MVT::i32));
6099 } else {
6100 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
6101 }
6102
6103 return Res;
6104 }
6105
6106 // Bitcast operand 1 to i32.
6107 if (SrcVT == MVT::f64)
6108 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6109 Tmp1).getValue(1);
6110 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
6111
6112 // Or in the signbit with integer operations.
6113 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
6114 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
6115 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
6116 if (VT == MVT::f32) {
6117 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
6118 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
6119 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
6120 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
6121 }
6122
6123 // f64: Or the high part with signbit and then combine two parts.
6124 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6125 Tmp0);
6126 SDValue Lo = Tmp0.getValue(0);
6127 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
6128 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
6129 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
6130}
6131
6132SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
6134 MachineFrameInfo &MFI = MF.getFrameInfo();
6135 MFI.setReturnAddressIsTaken(true);
6136
6138 return SDValue();
6139
6140 EVT VT = Op.getValueType();
6141 SDLoc dl(Op);
6142 unsigned Depth = Op.getConstantOperandVal(0);
6143 if (Depth) {
6144 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6145 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
6146 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
6147 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
6149 }
6150
6151 // Return LR, which contains the return address. Mark it an implicit live-in.
6152 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
6153 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
6154}
6155
6156SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
6157 const ARMBaseRegisterInfo &ARI =
6158 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
6160 MachineFrameInfo &MFI = MF.getFrameInfo();
6161 MFI.setFrameAddressIsTaken(true);
6162
6163 EVT VT = Op.getValueType();
6164 SDLoc dl(Op); // FIXME probably not meaningful
6165 unsigned Depth = Op.getConstantOperandVal(0);
6166 Register FrameReg = ARI.getFrameRegister(MF);
6167 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6168 while (Depth--)
6169 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
6171 return FrameAddr;
6172}
6173
6174// FIXME? Maybe this could be a TableGen attribute on some registers and
6175// this table could be generated automatically from RegInfo.
6176Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
6177 const MachineFunction &MF) const {
6179 .Case("sp", ARM::SP)
6180 .Default(0);
6181 if (Reg)
6182 return Reg;
6183 report_fatal_error(Twine("Invalid register name \""
6184 + StringRef(RegName) + "\"."));
6185}
6186
6187// Result is 64 bit value so split into two 32 bit values and return as a
6188// pair of values.
6190 SelectionDAG &DAG) {
6191 SDLoc DL(N);
6192
6193 // This function is only supposed to be called for i64 type destination.
6194 assert(N->getValueType(0) == MVT::i64
6195 && "ExpandREAD_REGISTER called for non-i64 type result.");
6196
6198 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
6199 N->getOperand(0),
6200 N->getOperand(1));
6201
6202 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
6203 Read.getValue(1)));
6204 Results.push_back(Read.getOperand(0));
6205}
6206
6207/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
6208/// When \p DstVT, the destination type of \p BC, is on the vector
6209/// register bank and the source of bitcast, \p Op, operates on the same bank,
6210/// it might be possible to combine them, such that everything stays on the
6211/// vector register bank.
6212/// \p return The node that would replace \p BT, if the combine
6213/// is possible.
6215 SelectionDAG &DAG) {
6216 SDValue Op = BC->getOperand(0);
6217 EVT DstVT = BC->getValueType(0);
6218
6219 // The only vector instruction that can produce a scalar (remember,
6220 // since the bitcast was about to be turned into VMOVDRR, the source
6221 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
6222 // Moreover, we can do this combine only if there is one use.
6223 // Finally, if the destination type is not a vector, there is not
6224 // much point on forcing everything on the vector bank.
6225 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6226 !Op.hasOneUse())
6227 return SDValue();
6228
6229 // If the index is not constant, we will introduce an additional
6230 // multiply that will stick.
6231 // Give up in that case.
6233 if (!Index)
6234 return SDValue();
6235 unsigned DstNumElt = DstVT.getVectorNumElements();
6236
6237 // Compute the new index.
6238 const APInt &APIntIndex = Index->getAPIntValue();
6239 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
6240 NewIndex *= APIntIndex;
6241 // Check if the new constant index fits into i32.
6242 if (NewIndex.getBitWidth() > 32)
6243 return SDValue();
6244
6245 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
6246 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
6247 SDLoc dl(Op);
6248 SDValue ExtractSrc = Op.getOperand(0);
6249 EVT VecVT = EVT::getVectorVT(
6250 *DAG.getContext(), DstVT.getScalarType(),
6251 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
6252 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
6253 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
6254 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
6255}
6256
6257/// ExpandBITCAST - If the target supports VFP, this function is called to
6258/// expand a bit convert where either the source or destination type is i64 to
6259/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
6260/// operand type is illegal (e.g., v2f32 for a target that doesn't support
6261/// vectors), since the legalizer won't know what to do with that.
6262SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
6263 const ARMSubtarget *Subtarget) const {
6264 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6265 SDLoc dl(N);
6266 SDValue Op = N->getOperand(0);
6267
6268 // This function is only supposed to be called for i16 and i64 types, either
6269 // as the source or destination of the bit convert.
6270 EVT SrcVT = Op.getValueType();
6271 EVT DstVT = N->getValueType(0);
6272
6273 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
6274 (DstVT == MVT::f16 || DstVT == MVT::bf16))
6275 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
6276 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
6277
6278 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
6279 (SrcVT == MVT::f16 || SrcVT == MVT::bf16))
6280 return DAG.getNode(
6281 ISD::TRUNCATE, SDLoc(N), DstVT,
6282 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
6283
6284 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
6285 return SDValue();
6286
6287 // Turn i64->f64 into VMOVDRR.
6288 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
6289 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
6290 // if we can combine the bitcast with its source.
6292 return Val;
6293 SDValue Lo, Hi;
6294 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
6295 return DAG.getNode(ISD::BITCAST, dl, DstVT,
6296 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
6297 }
6298
6299 // Turn f64->i64 into VMOVRRD.
6300 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
6301 SDValue Cvt;
6302 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
6303 SrcVT.getVectorNumElements() > 1)
6304 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6305 DAG.getVTList(MVT::i32, MVT::i32),
6306 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6307 else
6308 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6309 DAG.getVTList(MVT::i32, MVT::i32), Op);
6310 // Merge the pieces into a single i64 value.
6311 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6312 }
6313
6314 return SDValue();
6315}
6316
6317/// getZeroVector - Returns a vector of specified type with all zero elements.
6318/// Zero vectors are used to represent vector negation and in those cases
6319/// will be implemented with the NEON VNEG instruction. However, VNEG does
6320/// not support i64 elements, so sometimes the zero vectors will need to be
6321/// explicitly constructed. Regardless, use a canonical VMOV to create the
6322/// zero vector.
6323static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6324 assert(VT.isVector() && "Expected a vector type");
6325 // The canonical modified immediate encoding of a zero vector is....0!
6326 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6327 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6328 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6329 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6330}
6331
6332/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6333/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6334SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6335 SelectionDAG &DAG) const {
6336 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6337 EVT VT = Op.getValueType();
6338 unsigned VTBits = VT.getSizeInBits();
6339 SDLoc dl(Op);
6340 SDValue ShOpLo = Op.getOperand(0);
6341 SDValue ShOpHi = Op.getOperand(1);
6342 SDValue ShAmt = Op.getOperand(2);
6343 SDValue ARMcc;
6344 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6345 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6346
6347 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6348
6349 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6350 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6351 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6352 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6353 DAG.getConstant(VTBits, dl, MVT::i32));
6354 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6355 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6356 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6357 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6358 ISD::SETGE, ARMcc, DAG, dl);
6359 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
6360 ARMcc, CCR, CmpLo);
6361
6362 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6363 SDValue HiBigShift = Opc == ISD::SRA
6364 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6365 DAG.getConstant(VTBits - 1, dl, VT))
6366 : DAG.getConstant(0, dl, VT);
6367 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6368 ISD::SETGE, ARMcc, DAG, dl);
6369 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
6370 ARMcc, CCR, CmpHi);
6371
6372 SDValue Ops[2] = { Lo, Hi };
6373 return DAG.getMergeValues(Ops, dl);
6374}
6375
6376/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6377/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6378SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6379 SelectionDAG &DAG) const {
6380 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6381 EVT VT = Op.getValueType();
6382 unsigned VTBits = VT.getSizeInBits();
6383 SDLoc dl(Op);
6384 SDValue ShOpLo = Op.getOperand(0);
6385 SDValue ShOpHi = Op.getOperand(1);
6386 SDValue ShAmt = Op.getOperand(2);
6387 SDValue ARMcc;
6388 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6389
6390 assert(Op.getOpcode() == ISD::SHL_PARTS);
6391 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6392 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6393 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6394 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6395 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6396
6397 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6398 DAG.getConstant(VTBits, dl, MVT::i32));
6399 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6400 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6401 ISD::SETGE, ARMcc, DAG, dl);
6402 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
6403 ARMcc, CCR, CmpHi);
6404
6405 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6406 ISD::SETGE, ARMcc, DAG, dl);
6407 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6408 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6409 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);
6410
6411 SDValue Ops[2] = { Lo, Hi };
6412 return DAG.getMergeValues(Ops, dl);
6413}
6414
6415SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6416 SelectionDAG &DAG) const {
6417 // The rounding mode is in bits 23:22 of the FPSCR.
6418 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6419 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
6420 // so that the shift + and get folded into a bitfield extract.
6421 SDLoc dl(Op);
6422 SDValue Chain = Op.getOperand(0);
6423 SDValue Ops[] = {Chain,
6424 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6425
6426 SDValue FPSCR =
6427 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6428 Chain = FPSCR.getValue(1);
6429 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6430 DAG.getConstant(1U << 22, dl, MVT::i32));
6431 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6432 DAG.getConstant(22, dl, MVT::i32));
6433 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6434 DAG.getConstant(3, dl, MVT::i32));
6435 return DAG.getMergeValues({And, Chain}, dl);
6436}
6437
6438SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6439 SelectionDAG &DAG) const {
6440 SDLoc DL(Op);
6441 SDValue Chain = Op->getOperand(0);
6442 SDValue RMValue = Op->getOperand(1);
6443
6444 // The rounding mode is in bits 23:22 of the FPSCR.
6445 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6446 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6447 // ((arg - 1) & 3) << 22).
6448 //
6449 // It is expected that the argument of llvm.set.rounding is within the
6450 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is
6451 // responsibility of the code generated llvm.set.rounding to ensure this
6452 // condition.
6453
6454 // Calculate new value of FPSCR[23:22].
6455 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6456 DAG.getConstant(1, DL, MVT::i32));
6457 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6458 DAG.getConstant(0x3, DL, MVT::i32));
6459 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6460 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6461
6462 // Get current value of FPSCR.
6463 SDValue Ops[] = {Chain,
6464 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6465 SDValue FPSCR =
6466 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6467 Chain = FPSCR.getValue(1);
6468 FPSCR = FPSCR.getValue(0);
6469
6470 // Put new rounding mode into FPSCR[23:22].
6471 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6472 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6473 DAG.getConstant(RMMask, DL, MVT::i32));
6474 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6475 SDValue Ops2[] = {
6476 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6477 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6478}
6479
6480SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6481 SelectionDAG &DAG) const {
6482 SDLoc DL(Op);
6483 SDValue Chain = Op->getOperand(0);
6484 SDValue Mode = Op->getOperand(1);
6485
6486 // Generate nodes to build:
6487 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6488 SDValue Ops[] = {Chain,
6489 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6490 SDValue FPSCR =
6491 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6492 Chain = FPSCR.getValue(1);
6493 FPSCR = FPSCR.getValue(0);
6494
6495 SDValue FPSCRMasked =
6496 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6497 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6498 SDValue InputMasked =
6499 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6500 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6501 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6502
6503 SDValue Ops2[] = {
6504 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6505 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6506}
6507
// Lower RESET_FPMODE: read FPSCR, AND it with a mask, and write the result
// back. Operand 0 is the incoming chain; the returned node is the
// INTRINSIC_VOID that performs the write.
//
// NOTE(review): listing line 6524, which supplies the mask operand and closes
// the getNode call that starts at 6522, is absent from this view. Judging by
// the formula comment below it is presumably a constant built from
// (FPStatusBits | FPReservedBits) -- confirm against the upstream file.
6508SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6509 SelectionDAG &DAG) const {
6510 SDLoc DL(Op);
6511 SDValue Chain = Op->getOperand(0);
6512
6513 // To get the default FP mode all control bits are cleared:
6514 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
// Read the current FPSCR through the arm_get_fpscr intrinsic; value 1 of the
// node is the updated chain, value 0 is the register contents.
6515 SDValue Ops[] = {Chain,
6516 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6517 SDValue FPSCR =
6518 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6519 Chain = FPSCR.getValue(1);
6520 FPSCR = FPSCR.getValue(0);
6521
6522 SDValue FPSCRMasked = DAG.getNode(
6523 ISD::AND, DL, MVT::i32, FPSCR,
// Write the masked value back through the arm_set_fpscr intrinsic.
6525 SDValue Ops2[] = {Chain,
6526 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6527 FPSCRMasked};
6528 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6529}
6530
// Custom lowering of a count-trailing-zeros node N.
//
// NOTE(review): the first line of this signature (listing line 6531: return
// type, function name and leading parameters) is absent from this view. The
// body uses N (the node), DAG and ST (the subtarget).
//
// Vector types on NEON subtargets are lowered through the isolated lowest
// set bit, using either ctpop(lsb - 1) or (width - 1) - ctlz(lsb).
// Everything else needs V6T2 and becomes ctlz(bitreverse(x)); without V6T2
// an empty SDValue is returned.
6532 const ARMSubtarget *ST) {
6533 SDLoc dl(N);
6534 EVT VT = N->getValueType(0);
6535 if (VT.isVector() && ST->hasNEON()) {
6536
6537 // Compute the least significant set bit: LSB = X & -X
6538 SDValue X = N->getOperand(0);
6539 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6540 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
6541
6542 EVT ElemTy = VT.getVectorElementType();
6543
6544 if (ElemTy == MVT::i8) {
6545 // Compute with: cttz(x) = ctpop(lsb - 1)
6546 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6547 DAG.getTargetConstant(1, dl, ElemTy));
6548 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6549 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6550 }
6551
// The ctlz-based form is only taken for the ZERO_UNDEF opcode, because the
// identity below requires x != 0.
6552 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6553 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6554 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
6555 unsigned NumBits = ElemTy.getSizeInBits();
6556 SDValue WidthMinus1 =
6557 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6558 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6559 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6560 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6561 }
6562
6563 // Compute with: cttz(x) = ctpop(lsb - 1)
6564
6565 // Compute LSB - 1.
6566 SDValue Bits;
6567 if (ElemTy == MVT::i64) {
6568 // Load constant 0xffff'ffff'ffff'ffff to register.
// Adding all-ones is the same as subtracting one. 0x1eff is presumably the
// already-encoded modified immediate for that constant -- TODO confirm.
6569 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6570 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6571 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6572 } else {
6573 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6574 DAG.getTargetConstant(1, dl, ElemTy));
6575 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6576 }
6577 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6578 }
6579
// Non-NEON or non-vector path: bail out unless the subtarget has V6T2 ops.
6580 if (!ST->hasV6T2Ops())
6581 return SDValue();
6582
// cttz(x) == ctlz(bitreverse(x)).
6583 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6584 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6585}
6586
// Custom lowering of a vector population-count node N on NEON.
//
// NOTE(review): the first line of this signature (listing line 6587) is
// absent from this view, as is listing line 6606 inside the loop, which must
// declare the container `Ops` that the loop fills -- confirm against the
// upstream file.
//
// The operand is bitcast to a byte vector, CTPOP is taken per byte, and the
// per-byte counts are then summed pairwise with arm_neon_vpaddlu until the
// element width matches the requested type.
6588 const ARMSubtarget *ST) {
6589 EVT VT = N->getValueType(0);
6590 SDLoc DL(N);
6591
6592 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6593 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6594 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6595 "Unexpected type for custom ctpop lowering");
6596
6597 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Same total width as VT, viewed as bytes.
6598 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6599 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6600 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6601
6602 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
6603 unsigned EltSize = 8;
6604 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6605 while (EltSize != VT.getScalarSizeInBits()) {
// Operands of the intrinsic node: the intrinsic id, then the vector to widen.
6607 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6608 TLI.getPointerTy(DAG.getDataLayout())));
6609 Ops.push_back(Res);
6610
// Each step doubles the element width and halves the element count.
6611 EltSize *= 2;
6612 NumElts /= 2;
6613 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6614 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6615 }
6616
6617 return Res;
6618}
6619
6620/// getVShiftImm - Check if this is a valid build_vector for the immediate
6621/// operand of a vector shift operation, where all the elements of the
6622/// build_vector must have the same constant integer value.
/// On success the sign-extended splat value is stored in \p Cnt and true is
/// returned; otherwise false is returned and \p Cnt is left untouched.
///
/// NOTE(review): listing line 6627, which must declare `BVN`, is absent from
/// this view; it is presumably the result of casting Op's node to a
/// build-vector node -- confirm against the upstream file.
6623static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6624 // Ignore bit_converts.
6625 while (Op.getOpcode() == ISD::BITCAST)
6626 Op = Op.getOperand(0);
6628 APInt SplatBits, SplatUndef;
6629 unsigned SplatBitSize;
6630 bool HasAnyUndefs;
// Reject anything that is not a constant splat, or whose splat is wider than
// one element.
6631 if (!BVN ||
6632 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6633 ElementBits) ||
6634 SplatBitSize > ElementBits)
6635 return false;
6636 Cnt = SplatBits.getSExtValue();
6637 return true;
6638}
6639
6640/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6641/// operand of a vector shift left operation. That value must be in the range:
6642/// 0 <= Value < ElementBits for a left shift; or
6643/// 0 <= Value <= ElementBits for a long left shift.
6644static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6645 assert(VT.isVector() && "vector shift count is not a vector type");
6646 int64_t ElementBits = VT.getScalarSizeInBits();
6647 if (!getVShiftImm(Op, ElementBits, Cnt))
6648 return false;
6649 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6650}
6651
6652/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6653/// operand of a vector shift right operation. For a shift opcode, the value
6654/// is positive, but for an intrinsic the value count must be negative. The
6655/// absolute value must be in the range:
6656/// 1 <= |Value| <= ElementBits for a right shift; or
6657/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6658static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6659 int64_t &Cnt) {
6660 assert(VT.isVector() && "vector shift count is not a vector type");
6661 int64_t ElementBits = VT.getScalarSizeInBits();
6662 if (!getVShiftImm(Op, ElementBits, Cnt))
6663 return false;
6664 if (!isIntrinsic)
6665 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6666 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6667 Cnt = -Cnt;
6668 return true;
6669 }
6670 return false;
6671}
6672
// Custom lowering of a vector SHL/SRA/SRL node N. Non-vector types are
// rejected with an empty SDValue.
//
// NOTE(review): the first line of this signature (listing line 6673) is
// absent from this view. The body uses N and DAG; ST is not referenced in
// the visible lines.
6674 const ARMSubtarget *ST) {
6675 EVT VT = N->getValueType(0);
6676 SDLoc dl(N);
6677 int64_t Cnt;
6678
6679 if (!VT.isVector())
6680 return SDValue();
6681
6682 // We essentially have two forms here. Shift by an immediate and shift by a
6683 // vector register (there are also shift by a gpr, but that is just handled
6684 // with a tablegen pattern). We cannot easily match shift by an immediate in
6685 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6686 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6687 // signed or unsigned, and a negative shift indicates a shift right).
6688 if (N->getOpcode() == ISD::SHL) {
// Left shift: immediate form if the count is a suitable splat, otherwise the
// register form.
6689 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6690 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6691 DAG.getConstant(Cnt, dl, MVT::i32));
6692 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6693 N->getOperand(1));
6694 }
6695
6696 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6697 "unexpected vector shift opcode");
6698
// Right shift by a splat immediate: signed form for SRA, unsigned for SRL.
6699 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6700 unsigned VShiftOpc =
6701 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6702 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6703 DAG.getConstant(Cnt, dl, MVT::i32));
6704 }
6705
6706 // Other right shifts we don't have operations for (we use a shift left by a
6707 // negative number).
// The count vector is negated as (0 - count) in its own type.
6708 EVT ShiftVT = N->getOperand(1).getValueType();
6709 SDValue NegatedCount = DAG.getNode(
6710 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6711 unsigned VShiftOpc =
6712 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6713 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6714}
6715
// Custom lowering of an i64 SHL/SRL/SRA node N. Returns an empty SDValue
// whenever the generic lowering should be used instead.
//
// NOTE(review): the first line of this signature (listing line 6716) is
// absent from this view, as is listing line 6733, which must declare `Con`.
// From its uses below, `Con` is presumably the shift amount viewed as a
// constant node, null when the amount is not constant -- confirm against the
// upstream file.
6717 const ARMSubtarget *ST) {
6718 EVT VT = N->getValueType(0);
6719 SDLoc dl(N);
6720
6721 // We can get here for a node like i32 = ISD::SHL i32, i64
6722 if (VT != MVT::i64)
6723 return SDValue();
6724
6725 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6726 N->getOpcode() == ISD::SHL) &&
6727 "Unknown shift to lower!");
6728
6729 unsigned ShOpc = N->getOpcode();
6730 if (ST->hasMVEIntegerOps()) {
6731 SDValue ShAmt = N->getOperand(1);
// Default to the left-shift form; adjusted below for SRL/SRA.
6732 unsigned ShPartsOpc = ARMISD::LSLL;
6734
6735 // Fall back to the default lowering when the shift amount is not constant
6736 // and wider than 64 bits, or is a constant equal to zero or >= 32.
6737 if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
6738 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
6739 return SDValue();
6740
6741 // Extract the lower 32 bits of the shift amount if it's not an i32
6742 if (ShAmt->getValueType(0) != MVT::i32)
6743 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6744
6745 if (ShOpc == ISD::SRL) {
6746 if (!Con)
6747 // There is no t2LSRLr instruction so negate and perform an lsll if the
6748 // shift amount is in a register, emulating a right shift.
6749 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6750 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6751 else
6752 // Else generate an lsrl on the immediate shift amount
6753 ShPartsOpc = ARMISD::LSRL;
6754 } else if (ShOpc == ISD::SRA)
6755 ShPartsOpc = ARMISD::ASRL;
6756
6757 // Split Lower/Upper 32 bits of the destination/source
6758 SDValue Lo, Hi;
6759 std::tie(Lo, Hi) =
6760 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6761 // Generate the shift operation as computed above
6762 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6763 ShAmt);
6764 // The upper 32 bits come from the second return value of lsll
6765 Hi = SDValue(Lo.getNode(), 1);
6766 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6767 }
6768
6769 // We only lower SRA, SRL of 1 here, all others use generic lowering.
6770 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6771 return SDValue();
6772
6773 // If we are in thumb mode, we don't have RRX.
6774 if (ST->isThumb1Only())
6775 return SDValue();
6776
6777 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6778 SDValue Lo, Hi;
6779 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6780
6781 // First, build a SRA_GLUE/SRL_GLUE op, which shifts the top part by one and
6782 // captures the result into a carry flag.
6783 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_GLUE:ARMISD::SRA_GLUE;
6784 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
6785
6786 // The low part is an ARMISD::RRX operand, which shifts the carry in.
// Hi.getValue(1) is the glue result of the node built just above.
6787 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6788
6789 // Merge the pieces into a single i64 value.
6790 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6791}
6792
// Custom lowering of a vector SETCC node Op (operands: lhs, rhs, condition
// code) into ARMISD::VCMP / VCMPZ / VTST nodes. An optional operand swap
// (Swap) and a final bitwise NOT (Invert) express the conditions that have no
// direct compare form. Returns an empty SDValue when the comparison is not
// handled here.
//
// NOTE(review): several listing lines of this function are absent from this
// view -- 6793 (first line of the signature), 6808 (the NEON assignment to
// CmpVT), 6920 (the `if` that pairs with `AndOp = Op0;`), and 6945 / 6956
// (the first halves of the two compare-against-zero conditions). Comments
// near each gap describe only what the visible lines establish.
6794 const ARMSubtarget *ST) {
6795 bool Invert = false;
6796 bool Swap = false;
6797 unsigned Opc = ARMCC::AL;
6798
6799 SDValue Op0 = Op.getOperand(0);
6800 SDValue Op1 = Op.getOperand(1);
6801 SDValue CC = Op.getOperand(2);
6802 EVT VT = Op.getValueType();
6803 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6804 SDLoc dl(Op);
6805
// CmpVT is the type in which the compare node itself is built.
// NOTE(review): the statement executed when hasNEON() is true (listing line
// 6808) is missing here; it must assign CmpVT.
6806 EVT CmpVT;
6807 if (ST->hasNEON())
6809 else {
6810 assert(ST->hasMVEIntegerOps() &&
6811 "No hardware support for integer vector comparison!");
6812
6813 if (Op.getValueType().getVectorElementType() != MVT::i1)
6814 return SDValue();
6815
6816 // Make sure we expand floating point setcc to scalar if we do not have
6817 // mve.fp, so that we can handle them from there.
6818 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6819 return SDValue();
6820
6821 CmpVT = VT;
6822 }
6823
6824 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6825 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6826 // Special-case integer 64-bit equality comparisons. They aren't legal,
6827 // but they can be lowered with a few vector instructions.
// Compare as twice as many i32 lanes, then AND each lane with the
// VREV64-reversed result so both halves of every 64-bit element must match.
6828 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6829 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6830 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6831 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6832 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6833 DAG.getCondCode(ISD::SETEQ));
6834 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6835 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6836 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6837 if (SetCCOpcode == ISD::SETNE)
6838 Merged = DAG.getNOT(dl, Merged, CmpVT);
6839 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6840 return Merged;
6841 }
6842
6843 if (CmpVT.getVectorElementType() == MVT::i64)
6844 // 64-bit comparisons are not legal in general.
6845 return SDValue();
6846
6847 if (Op1.getValueType().isFloatingPoint()) {
// Floating-point comparisons: map the condition onto EQ/NE/GT/GE plus Swap
// and Invert. SETONE/SETUEQ and SETO/SETUO are expanded into two compares
// and return directly from inside the switch.
6848 switch (SetCCOpcode) {
6849 default: llvm_unreachable("Illegal FP comparison");
6850 case ISD::SETUNE:
6851 case ISD::SETNE:
6852 if (ST->hasMVEFloatOps()) {
6853 Opc = ARMCC::NE; break;
6854 } else {
6855 Invert = true; [[fallthrough]];
6856 }
6857 case ISD::SETOEQ:
6858 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6859 case ISD::SETOLT:
6860 case ISD::SETLT: Swap = true; [[fallthrough]];
6861 case ISD::SETOGT:
6862 case ISD::SETGT: Opc = ARMCC::GT; break;
6863 case ISD::SETOLE:
6864 case ISD::SETLE: Swap = true; [[fallthrough]];
6865 case ISD::SETOGE:
6866 case ISD::SETGE: Opc = ARMCC::GE; break;
6867 case ISD::SETUGE: Swap = true; [[fallthrough]];
6868 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6869 case ISD::SETUGT: Swap = true; [[fallthrough]];
6870 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6871 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6872 case ISD::SETONE: {
6873 // Expand this to (OLT | OGT).
6874 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6875 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6876 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6877 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6878 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
// NOTE(review): Result has type CmpVT but the NOT is built with VT, and no
// getSExtOrTrunc is applied on this early-return path -- confirm CmpVT and VT
// always coincide here.
6879 if (Invert)
6880 Result = DAG.getNOT(dl, Result, VT);
6881 return Result;
6882 }
6883 case ISD::SETUO: Invert = true; [[fallthrough]];
6884 case ISD::SETO: {
6885 // Expand this to (OLT | OGE).
6886 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6887 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6888 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6889 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6890 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6891 if (Invert)
6892 Result = DAG.getNOT(dl, Result, VT);
6893 return Result;
6894 }
6895 }
6896 } else {
6897 // Integer comparisons.
6898 switch (SetCCOpcode) {
6899 default: llvm_unreachable("Illegal integer comparison");
6900 case ISD::SETNE:
6901 if (ST->hasMVEIntegerOps()) {
6902 Opc = ARMCC::NE; break;
6903 } else {
6904 Invert = true; [[fallthrough]];
6905 }
6906 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6907 case ISD::SETLT: Swap = true; [[fallthrough]];
6908 case ISD::SETGT: Opc = ARMCC::GT; break;
6909 case ISD::SETLE: Swap = true; [[fallthrough]];
6910 case ISD::SETGE: Opc = ARMCC::GE; break;
6911 case ISD::SETULT: Swap = true; [[fallthrough]];
6912 case ISD::SETUGT: Opc = ARMCC::HI; break;
6913 case ISD::SETULE: Swap = true; [[fallthrough]];
6914 case ISD::SETUGE: Opc = ARMCC::HS; break;
6915 }
6916
6917 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6918 if (ST->hasNEON() && Opc == ARMCC::EQ) {
// AndOp becomes whichever operand is compared against an all-zeros vector.
// NOTE(review): the `if` guarding `AndOp = Op0;` (listing line 6920) is
// missing here; by symmetry with the `else if` it presumably tests Op1 for
// all-zeros -- confirm.
6919 SDValue AndOp;
6921 AndOp = Op0;
6922 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6923 AndOp = Op1;
6924
6925 // Ignore bitconvert.
6926 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6927 AndOp = AndOp.getOperand(0);
6928
6929 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6930 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6931 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6932 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
// VTST yields the "ne" sense, so a plain EQ request (Invert unset) needs NOT.
6933 if (!Invert)
6934 Result = DAG.getNOT(dl, Result, VT);
6935 return Result;
6936 }
6937 }
6938 }
6939
6940 if (Swap)
6941 std::swap(Op0, Op1);
6942
6943 // If one of the operands is a constant vector zero, attempt to fold the
6944 // comparison to a specialized compare-against-zero form.
// NOTE(review): the opening of this condition (listing line 6945) is missing.
// The visible body swaps the operands and mirrors GE->LE and GT->LT.
6946 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6947 Opc == ARMCC::NE)) {
6948 if (Opc == ARMCC::GE)
6949 Opc = ARMCC::LE;
6950 else if (Opc == ARMCC::GT)
6951 Opc = ARMCC::LT;
6952 std::swap(Op0, Op1);
6953 }
6954
6955 SDValue Result;
// NOTE(review): the opening of this condition (listing line 6956) is missing.
// When it holds, the one-operand VCMPZ form is used; otherwise the
// two-operand VCMP form.
6957 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6958 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6959 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6960 DAG.getConstant(Opc, dl, MVT::i32));
6961 else
6962 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6963 DAG.getConstant(Opc, dl, MVT::i32));
6964
// Convert from the compare type to the requested result type.
6965 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6966
6967 if (Invert)
6968 Result = DAG.getNOT(dl, Result, VT);
6969
6970 return Result;
6971}
6972
// Lower a SETCCCARRY-style node Op with operands (LHS, RHS, carry-in,
// condition code): build an ARMISD::SUBE of LHS and RHS with the converted
// carry, then select 1 or 0 with an ARMISD::CMOV on the resulting flags.
//
// NOTE(review): the signature line (listing line 6973) is absent from this
// view; the body uses Op and DAG.
6974 SDValue LHS = Op.getOperand(0);
6975 SDValue RHS = Op.getOperand(1);
6976 SDValue Carry = Op.getOperand(2);
6977 SDValue Cond = Op.getOperand(3);
6978 SDLoc DL(Op);
6979
6980 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6981
6982 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6983 // have to invert the carry first.
// The inversion is computed as (1 - Carry).
6984 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6985 DAG.getConstant(1, DL, MVT::i32), Carry);
6986 // This converts the boolean value carry into the carry flag.
6987 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6988
// Result 0 of SUBE is the difference (unused below); result 1 is copied into
// CPSR further down.
6989 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6990 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6991
6992 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6993 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6994 SDValue ARMcc = DAG.getConstant(
6995 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6996 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
// Move the SUBE flag result into CPSR; value 1 of the copy is handed to the
// CMOV as its last operand.
6997 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
6998 Cmp.getValue(1), SDValue());
6999 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
7000 CCR, Chain.getValue(1));
7001}
7002
7003/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
7004/// valid vector constant for a NEON or MVE instruction with a "modified
7005/// immediate" operand (e.g., VMOV). If so, return the encoded value.
7006static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
7007 unsigned SplatBitSize, SelectionDAG &DAG,
7008 const SDLoc &dl, EVT &VT, EVT VectorVT,
7009 VMOVModImmType type) {
7010 unsigned OpCmode, Imm;
7011 bool is128Bits = VectorVT.is128BitVector();
7012
7013 // SplatBitSize is set to the smallest size that splats the vector, so a
7014 // zero vector will always have SplatBitSize == 8. However, NEON modified
7015 // immediate instructions others than VMOV do not support the 8-bit encoding
7016 // of a zero vector, and the default encoding of zero is supposed to be the
7017 // 32-bit version.
7018 if (SplatBits == 0)
7019 SplatBitSize = 32;
7020
7021 switch (SplatBitSize) {
7022 case 8:
7023 if (type != VMOVModImm)
7024 return SDValue();
7025 // Any 1-byte value is OK. Op=0, Cmode=1110.
7026 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
7027 OpCmode = 0xe;
7028 Imm = SplatBits;
7029 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
7030 break;
7031
7032 case 16:
7033 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
7034 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
7035 if ((SplatBits & ~0xff) == 0) {
7036 // Value = 0x00nn: Op=x, Cmode=100x.
7037 OpCmode = 0x8;
7038 Imm = SplatBits;
7039 break;
7040 }
7041 if ((SplatBits & ~0xff00) == 0) {
7042 // Value = 0xnn00: Op=x, Cmode=101x.
7043 OpCmode = 0xa;
7044 Imm = SplatBits >> 8;
7045 break;
7046 }
7047 return SDValue();
7048
7049 case 32:
7050 // NEON's 32-bit VMOV supports splat values where:
7051 // * only one byte is nonzero, or
7052 // * the least significant byte is 0xff and the second byte is nonzero, or
7053 // * the least significant 2 bytes are 0xff and the third is nonzero.
7054 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
7055 if ((SplatBits & ~0xff) == 0) {
7056 // Value = 0x000000nn: Op=x, Cmode=000x.
7057 OpCmode = 0;
7058 Imm = SplatBits;
7059 break;
7060 }
7061 if ((SplatBits & ~0xff00) == 0) {
7062 // Value = 0x0000nn00: Op=x, Cmode=001x.
7063 OpCmode = 0x2;
7064 Imm = SplatBits >> 8;
7065 break;
7066 }
7067 if ((SplatBits & ~0xff0000) == 0) {
7068 // Value = 0x00nn0000: Op=x, Cmode=010x.
7069 OpCmode = 0x4;
7070 Imm = SplatBits >> 16;
7071 break;
7072 }
7073 if ((SplatBits & ~0xff000000) == 0) {
7074 // Value = 0xnn000000: Op=x, Cmode=011x.
7075 OpCmode = 0x6;
7076 Imm = SplatBits >> 24;
7077 break;
7078 }
7079
7080 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
7081 if (type == OtherModImm) return SDValue();
7082
7083 if ((SplatBits & ~0xffff) == 0 &&
7084 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
7085 // Value = 0x0000nnff: Op=x, Cmode=1100.
7086 OpCmode = 0xc;
7087 Imm = SplatBits >> 8;
7088 break;
7089 }
7090
7091 // cmode == 0b1101 is not supported for MVE VMVN
7092 if (type == MVEVMVNModImm)
7093 return SDValue();
7094
7095 if ((SplatBits & ~0xffffff) == 0 &&
7096 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
7097 // Value = 0x00nnffff: Op=x, Cmode=1101.
7098 OpCmode = 0xd;
7099 Imm = SplatBits >> 16;
7100 break;
7101 }
7102
7103 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
7104 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
7105 // VMOV.I32. A (very) minor optimization would be to replicate the value
7106 // and fall through here to test for a valid 64-bit splat. But, then the
7107 // caller would also need to check and handle the change in size.
7108 return SDValue();
7109
7110 case 64: {
7111 if (type != VMOVModImm)
7112 return SDValue();
7113 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
7114 uint64_t BitMask = 0xff;
7115 unsigned ImmMask = 1;
7116 Imm = 0;
7117 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
7118 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
7119 Imm |= ImmMask;
7120 } else if ((SplatBits & BitMask) != 0) {
7121 return SDValue();
7122 }
7123 BitMask <<= 8;
7124 ImmMask <<= 1;
7125 }
7126
7127 // Op=1, Cmode=1110.
7128 OpCmode = 0x1e;
7129 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
7130 break;
7131 }
7132
7133 default:
7134 llvm_unreachable("unexpected size for isVMOVModifiedImm");
7135 }
7136
7137 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
7138 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
7139}
7140
// Custom lowering of a floating-point constant node Op. Depending on the
// subtarget this returns Op unchanged (the constant is selectable as is), a
// replacement built from integer or vector-immediate moves, or an empty
// SDValue to request the default lowering.
//
// NOTE(review): listing line 7145, which must declare `CFP`, is absent from
// this view; from its uses it is presumably Op's node viewed as a constant-FP
// node -- confirm against the upstream file.
7141SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
7142 const ARMSubtarget *ST) const {
7143 EVT VT = Op.getValueType();
7144 bool IsDouble = (VT == MVT::f64);
7146 const APFloat &FPVal = CFP->getValueAPF();
7147
7148 // Prevent floating-point constants from using literal loads
7149 // when execute-only is enabled.
7150 if (ST->genExecuteOnly()) {
7151 // We shouldn't trigger this for v6m execute-only
7152 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
7153 "Unexpected architecture");
7154
7155 // If we can represent the constant as an immediate, don't lower it
7156 if (isFPImmLegal(FPVal, VT))
7157 return Op;
7158 // Otherwise, construct as integer, and move to float register
7159 APInt INTVal = FPVal.bitcastToAPInt();
7160 SDLoc DL(CFP);
7161 switch (VT.getSimpleVT().SimpleTy) {
7162 default:
7163 llvm_unreachable("Unknown floating point type!");
7164 break;
7165 case MVT::f64: {
// Split the 64-bit pattern into low and high 32-bit constants.
7166 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
7167 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
7168 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
7169 }
7170 case MVT::f32:
7171 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
7172 DAG.getConstant(INTVal, DL, MVT::i32));
7173 }
7174 }
7175
7176 if (!ST->hasVFP3Base())
7177 return SDValue();
7178
7179 // Use the default (constant pool) lowering for double constants when we have
7180 // an SP-only FPU
// NOTE(review): this check reads the member `Subtarget` while the rest of the
// function uses the `ST` parameter -- confirm they always refer to the same
// subtarget.
7181 if (IsDouble && !Subtarget->hasFP64())
7182 return SDValue();
7183
7184 // Try splatting with a VMOV.f32...
// ImmVal is -1 when the value has no such immediate encoding.
7185 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
7186
7187 if (ImmVal != -1) {
7188 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
7189 // We have code in place to select a valid ConstantFP already, no need to
7190 // do any mangling.
7191 return Op;
7192 }
7193
7194 // It's a float and we are trying to use NEON operations where
7195 // possible. Lower it to a splat followed by an extract.
7196 SDLoc DL(Op);
7197 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
7198 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
7199 NewVal);
7200 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
7201 DAG.getConstant(0, DL, MVT::i32));
7202 }
7203
7204 // The rest of our options are NEON only, make sure that's allowed before
7205 // proceeding..
7206 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
7207 return SDValue();
7208
7209 EVT VMovVT;
7210 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
7211
7212 // It wouldn't really be worth bothering for doubles except for one very
7213 // important value, which does happen to match: 0.0. So make sure we don't do
7214 // anything stupid.
// A double is only handled when its two 32-bit halves are identical, so a
// 32-bit splat reproduces it.
7215 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
7216 return SDValue();
7217
7218 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
7219 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
7220 VMovVT, VT, VMOVModImm);
7221 if (NewVal != SDValue()) {
7222 SDLoc DL(Op);
7223 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
7224 NewVal);
7225 if (IsDouble)
7226 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7227
7228 // It's a float: cast and extract a vector element.
7229 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7230 VecConstant);
7231 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7232 DAG.getConstant(0, DL, MVT::i32));
7233 }
7234
7235 // Finally, try a VMVN.i32
// Same as above, but encoding the bitwise complement of the low 32 bits.
7236 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
7237 VT, VMVNModImm);
7238 if (NewVal != SDValue()) {
7239 SDLoc DL(Op);
7240 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
7241
7242 if (IsDouble)
7243 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7244
7245 // It's a float: cast and extract a vector element.
7246 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7247 VecConstant);
7248 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7249 DAG.getConstant(0, DL, MVT::i32));
7250 }
7251
7252 return SDValue();
7253}
7254
7255// check if an VEXT instruction can handle the shuffle mask when the
7256// vector sources of the shuffle are the same.
7257static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7258 unsigned NumElts = VT.getVectorNumElements();
7259
7260 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7261 if (M[0] < 0)
7262 return false;
7263
7264 Imm = M[0];
7265
7266 // If this is a VEXT shuffle, the immediate value is the index of the first
7267 // element. The other shuffle indices must be the successive elements after
7268 // the first one.
7269 unsigned ExpectedElt = Imm;
7270 for (unsigned i = 1; i < NumElts; ++i) {
7271 // Increment the expected index. If it wraps around, just follow it
7272 // back to index zero and keep going.
7273 ++ExpectedElt;
7274 if (ExpectedElt == NumElts)
7275 ExpectedElt = 0;
7276
7277 if (M[i] < 0) continue; // ignore UNDEF indices
7278 if (ExpectedElt != static_cast<unsigned>(M[i]))
7279 return false;
7280 }
7281
7282 return true;
7283}
7284
7285static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7286 bool &ReverseVEXT, unsigned &Imm) {
7287 unsigned NumElts = VT.getVectorNumElements();
7288 ReverseVEXT = false;
7289
7290 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7291 if (M[0] < 0)
7292 return false;
7293
7294 Imm = M[0];
7295
7296 // If this is a VEXT shuffle, the immediate value is the index of the first
7297 // element. The other shuffle indices must be the successive elements after
7298 // the first one.
7299 unsigned ExpectedElt = Imm;
7300 for (unsigned i = 1; i < NumElts; ++i) {
7301 // Increment the expected index. If it wraps around, it may still be
7302 // a VEXT but the source vectors must be swapped.
7303 ExpectedElt += 1;
7304 if (ExpectedElt == NumElts * 2) {
7305 ExpectedElt = 0;
7306 ReverseVEXT = true;
7307 }
7308
7309 if (M[i] < 0) continue; // ignore UNDEF indices
7310 if (ExpectedElt != static_cast<unsigned>(M[i]))
7311 return false;
7312 }
7313
7314 // Adjust the index value if the source operands will be swapped.
7315 if (ReverseVEXT)
7316 Imm -= NumElts;
7317
7318 return true;
7319}
7320
7321static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7322 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7323 // range, then 0 is placed into the resulting vector. So pretty much any mask
7324 // of 8 elements can work here.
7325 return VT == MVT::v8i8 && M.size() == 8;
7326}
7327
7328static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7329 unsigned Index) {
7330 if (Mask.size() == Elements * 2)
7331 return Index / Elements;
7332 return Mask[Index] == 0 ? 0 : 1;
7333}
7334
7335// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7336// checking that pairs of elements in the shuffle mask represent the same index
7337// in each vector, incrementing the expected index by 2 at each step.
7338// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7339// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7340// v2={e,f,g,h}
7341// WhichResult gives the offset for each element in the mask based on which
7342// of the two results it belongs to.
7343//
7344// The transpose can be represented either as:
7345// result1 = shufflevector v1, v2, result1_shuffle_mask
7346// result2 = shufflevector v1, v2, result2_shuffle_mask
7347// where v1/v2 and the shuffle masks have the same number of elements
7348// (here WhichResult (see below) indicates which result is being checked)
7349//
7350// or as:
7351// results = shufflevector v1, v2, shuffle_mask
7352// where both results are returned in one vector and the shuffle mask has twice
7353// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
7354// want to check the low half and high half of the shuffle mask as if it were
7355// the other case
7356static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7357 unsigned EltSz = VT.getScalarSizeInBits();
7358 if (EltSz == 64)
7359 return false;
7360
7361 unsigned NumElts = VT.getVectorNumElements();
7362 if (M.size() != NumElts && M.size() != NumElts*2)
7363 return false;
7364
7365 // If the mask is twice as long as the input vector then we need to check the
7366 // upper and lower parts of the mask with a matching value for WhichResult
7367 // FIXME: A mask with only even values will be rejected in case the first
7368 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7369 // M[0] is used to determine WhichResult
7370 for (unsigned i = 0; i < M.size(); i += NumElts) {
7371 WhichResult = SelectPairHalf(NumElts, M, i);
7372 for (unsigned j = 0; j < NumElts; j += 2) {
7373 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7374 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7375 return false;
7376 }
7377 }
7378
7379 if (M.size() == NumElts*2)
7380 WhichResult = 0;
7381
7382 return true;
7383}
7384
7385/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7386/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7387/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7388static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7389 unsigned EltSz = VT.getScalarSizeInBits();
7390 if (EltSz == 64)
7391 return false;
7392
7393 unsigned NumElts = VT.getVectorNumElements();
7394 if (M.size() != NumElts && M.size() != NumElts*2)
7395 return false;
7396
7397 for (unsigned i = 0; i < M.size(); i += NumElts) {
7398 WhichResult = SelectPairHalf(NumElts, M, i);
7399 for (unsigned j = 0; j < NumElts; j += 2) {
7400 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7401 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7402 return false;
7403 }
7404 }
7405
7406 if (M.size() == NumElts*2)
7407 WhichResult = 0;
7408
7409 return true;
7410}
7411
7412// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7413// that the mask elements are either all even and in steps of size 2 or all odd
7414// and in steps of size 2.
7415// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7416// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7417// v2={e,f,g,h}
7418// Requires similar checks to that of isVTRNMask with
7419// respect the how results are returned.
7420static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7421 unsigned EltSz = VT.getScalarSizeInBits();
7422 if (EltSz == 64)
7423 return false;
7424
7425 unsigned NumElts = VT.getVectorNumElements();
7426 if (M.size() != NumElts && M.size() != NumElts*2)
7427 return false;
7428
7429 for (unsigned i = 0; i < M.size(); i += NumElts) {
7430 WhichResult = SelectPairHalf(NumElts, M, i);
7431 for (unsigned j = 0; j < NumElts; ++j) {
7432 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7433 return false;
7434 }
7435 }
7436
7437 if (M.size() == NumElts*2)
7438 WhichResult = 0;
7439
7440 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7441 if (VT.is64BitVector() && EltSz == 32)
7442 return false;
7443
7444 return true;
7445}
7446
7447/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7448/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7449/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
7450static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7451 unsigned EltSz = VT.getScalarSizeInBits();
7452 if (EltSz == 64)
7453 return false;
7454
7455 unsigned NumElts = VT.getVectorNumElements();
7456 if (M.size() != NumElts && M.size() != NumElts*2)
7457 return false;
7458
7459 unsigned Half = NumElts / 2;
7460 for (unsigned i = 0; i < M.size(); i += NumElts) {
7461 WhichResult = SelectPairHalf(NumElts, M, i);
7462 for (unsigned j = 0; j < NumElts; j += Half) {
7463 unsigned Idx = WhichResult;
7464 for (unsigned k = 0; k < Half; ++k) {
7465 int MIdx = M[i + j + k];
7466 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7467 return false;
7468 Idx += 2;
7469 }
7470 }
7471 }
7472
7473 if (M.size() == NumElts*2)
7474 WhichResult = 0;
7475
7476 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7477 if (VT.is64BitVector() && EltSz == 32)
7478 return false;
7479
7480 return true;
7481}
7482
7483// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7484// that pairs of elements of the shufflemask represent the same index in each
7485// vector incrementing sequentially through the vectors.
7486// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7487// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7488// v2={e,f,g,h}
7489// Requires similar checks to that of isVTRNMask with respect the how results
7490// are returned.
7491static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7492 unsigned EltSz = VT.getScalarSizeInBits();
7493 if (EltSz == 64)
7494 return false;
7495
7496 unsigned NumElts = VT.getVectorNumElements();
7497 if (M.size() != NumElts && M.size() != NumElts*2)
7498 return false;
7499
7500 for (unsigned i = 0; i < M.size(); i += NumElts) {
7501 WhichResult = SelectPairHalf(NumElts, M, i);
7502 unsigned Idx = WhichResult * NumElts / 2;
7503 for (unsigned j = 0; j < NumElts; j += 2) {
7504 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7505 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7506 return false;
7507 Idx += 1;
7508 }
7509 }
7510
7511 if (M.size() == NumElts*2)
7512 WhichResult = 0;
7513
7514 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7515 if (VT.is64BitVector() && EltSz == 32)
7516 return false;
7517
7518 return true;
7519}
7520
7521/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7522/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7523/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7524static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7525 unsigned EltSz = VT.getScalarSizeInBits();
7526 if (EltSz == 64)
7527 return false;
7528
7529 unsigned NumElts = VT.getVectorNumElements();
7530 if (M.size() != NumElts && M.size() != NumElts*2)
7531 return false;
7532
7533 for (unsigned i = 0; i < M.size(); i += NumElts) {
7534 WhichResult = SelectPairHalf(NumElts, M, i);
7535 unsigned Idx = WhichResult * NumElts / 2;
7536 for (unsigned j = 0; j < NumElts; j += 2) {
7537 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7538 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7539 return false;
7540 Idx += 1;
7541 }
7542 }
7543
7544 if (M.size() == NumElts*2)
7545 WhichResult = 0;
7546
7547 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7548 if (VT.is64BitVector() && EltSz == 32)
7549 return false;
7550
7551 return true;
7552}
7553
7554/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7555/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7556static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7557 unsigned &WhichResult,
7558 bool &isV_UNDEF) {
7559 isV_UNDEF = false;
7560 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7561 return ARMISD::VTRN;
7562 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7563 return ARMISD::VUZP;
7564 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7565 return ARMISD::VZIP;
7566
7567 isV_UNDEF = true;
7568 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7569 return ARMISD::VTRN;
7570 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7571 return ARMISD::VUZP;
7572 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7573 return ARMISD::VZIP;
7574
7575 return 0;
7576}
7577
7578/// \return true if this is a reverse operation on an vector.
7579static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7580 unsigned NumElts = VT.getVectorNumElements();
7581 // Make sure the mask has the right size.
7582 if (NumElts != M.size())
7583 return false;
7584
7585 // Look for <15, ..., 3, -1, 1, 0>.
7586 for (unsigned i = 0; i != NumElts; ++i)
7587 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7588 return false;
7589
7590 return true;
7591}
7592
7593static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7594 unsigned NumElts = VT.getVectorNumElements();
7595 // Make sure the mask has the right size.
7596 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7597 return false;
7598
7599 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7600 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7601 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7602 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7603 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7604 int Ofs = Top ? 1 : 0;
7605 int Upper = SingleSource ? 0 : NumElts;
7606 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7607 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7608 return false;
7609 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7610 return false;
7611 }
7612 return true;
7613}
7614
7615static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7616 unsigned NumElts = VT.getVectorNumElements();
7617 // Make sure the mask has the right size.
7618 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7619 return false;
7620
7621 // If Top
7622 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7623 // This inserts Input2 into Input1
7624 // else if not Top
7625 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7626 // This inserts Input1 into Input2
7627 unsigned Offset = Top ? 0 : 1;
7628 unsigned N = SingleSource ? 0 : NumElts;
7629 for (unsigned i = 0; i < NumElts; i += 2) {
7630 if (M[i] >= 0 && M[i] != (int)i)
7631 return false;
7632 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7633 return false;
7634 }
7635
7636 return true;
7637}
7638
7639static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7640 unsigned NumElts = ToVT.getVectorNumElements();
7641 if (NumElts != M.size())
7642 return false;
7643
7644 // Test if the Trunc can be convertable to a VMOVN with this shuffle. We are
7645 // looking for patterns of:
7646 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7647 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7648
7649 unsigned Off0 = rev ? NumElts / 2 : 0;
7650 unsigned Off1 = rev ? 0 : NumElts / 2;
7651 for (unsigned i = 0; i < NumElts; i += 2) {
7652 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7653 return false;
7654 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7655 return false;
7656 }
7657
7658 return true;
7659}
7660
7661 // Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7662 // from a pair of inputs. For example:
7663 // BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0)),
7664 // FP_ROUND(EXTRACT_ELT(Y, 0)),
7665 // FP_ROUND(EXTRACT_ELT(X, 1)),
7666 // FP_ROUND(EXTRACT_ELT(Y, 1)), ...)
// Only a v8f16 result built from two v4f32 sources is handled, and only when
// the subtarget has MVE float ops; every other case returns an empty SDValue.
// On success the result is two chained ARMISD::VCVTN nodes: the first combines
// an undef vector with Op0 (the source of the even result lanes) and constant
// 0, the second combines that node with Op1 (the source of the odd result
// lanes) and constant 1.
7668 const ARMSubtarget *ST) {
7669 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7670 if (!ST->hasMVEFloatOps())
7671 return SDValue();
7672
7673 SDLoc dl(BV);
7674 EVT VT = BV.getValueType();
7675 if (VT != MVT::v8f16)
7676 return SDValue();
7677
7678 // We are looking for a buildvector of fptrunc elements, where all the
7679 // elements are interleavingly extracted from two sources. Check the first two
7680 // items are valid enough and extract some info from them (they are checked
7681 // properly in the loop below).
7682 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7685 return SDValue();
7686 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7689 return SDValue();
// Op0 / Op1 are the vectors that operands 0 and 1 of the BuildVector are
// extracted from.
7690 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7691 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7692 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7693 return SDValue();
7694
7695 // Check all the values in the BuildVector line up with our expectations.
7696 for (unsigned i = 1; i < 4; i++) {
// Check: Trunc must be an FP_ROUND whose input is taken from vector Op at
// constant index Idx.
7697 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7698 return Trunc.getOpcode() == ISD::FP_ROUND &&
7700 Trunc.getOperand(0).getOperand(0) == Op &&
7701 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7702 };
// Even result lanes must come from Op0 and odd result lanes from Op1, both at
// source index i.
7703 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7704 return SDValue();
7705 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7706 return SDValue();
7707 }
7708
7709 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7710 DAG.getConstant(0, dl, MVT::i32));
7711 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7712 DAG.getConstant(1, dl, MVT::i32));
7713 }
7714
7715 // Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7716 // from a single input on alternating lanes. For example:
7717 // BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0)),
7718 // FP_EXTEND(EXTRACT_ELT(X, 2)),
7719 // FP_EXTEND(EXTRACT_ELT(X, 4)), ...)
// Only a v4f32 result built from a v8f16 source is handled, and only when the
// subtarget has MVE float ops; every other case returns an empty SDValue.
// On success the result is an ARMISD::VCVTL node of the source vector with the
// lane offset (0 or 1) as a constant operand.
7721 const ARMSubtarget *ST) {
7722 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7723 if (!ST->hasMVEFloatOps())
7724 return SDValue();
7725
7726 SDLoc dl(BV);
7727 EVT VT = BV.getValueType();
7728 if (VT != MVT::v4f32)
7729 return SDValue();
7730
7731 // We are looking for a buildvector of fpext elements, where all the
7732 // elements are alternating lanes from a single source. For example <0,2,4,6>
7733 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7734 // info from them (they are checked properly in the loop below).
7735 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7737 return SDValue();
7738 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
// The source must be v8f16 and the starting lane offset must be 0 or 1.
7740 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7741 return SDValue();
7742
7743 // Check all the values in the BuildVector line up with our expectations.
7744 for (unsigned i = 1; i < 4; i++) {
// Check: the value (named Trunc here, although it is an extend) must be an
// FP_EXTEND whose input is taken from vector Op at constant index Idx.
7745 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7746 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7748 Trunc.getOperand(0).getOperand(0) == Op &&
7749 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7750 };
// Result lane i must read source lane 2 * i + Offset.
7751 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7752 return SDValue();
7753 }
7754
7755 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7756 DAG.getConstant(Offset, dl, MVT::i32));
7757 }
7758
7759 // If N is an integer constant that can be moved into a register in one
7760 // instruction, return an SDValue of such a constant (will become a MOV
7761 // instruction). Otherwise return null.
// The returned constant is always created with type MVT::i32.
7763 const ARMSubtarget *ST, const SDLoc &dl) {
7764 uint64_t Val;
7765 if (!isa<ConstantSDNode>(N))
7766 return SDValue();
7767 Val = N->getAsZExtVal();
7768
7769 if (ST->isThumb1Only()) {
// Thumb1-only: accept the value when it, or its bitwise complement, is at
// most 255.
// NOTE(review): Val is a uint64_t, so ~Val also flips the upper 32 bits; for
// a zero-extended 32-bit constant the "~Val <= 255" test looks like it can
// never be true - confirm the intent.
7770 if (Val <= 255 || ~Val <= 255)
7771 return DAG.getConstant(Val, dl, MVT::i32);
7772 } else {
// Otherwise: accept when ARM_AM::getSOImmVal does not return -1 for either
// the value or its bitwise complement.
7773 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7774 return DAG.getConstant(Val, dl, MVT::i32);
7775 }
7776 return SDValue();
7777 }
7778
7780 const ARMSubtarget *ST) {
// Lowers a BUILD_VECTOR of 1-bit elements for MVE. Constant and undef lanes
// are folded into a single 32-bit constant mask that is cast to the predicate
// type; the remaining lanes are then inserted one at a time. Vectors with a
// lane count other than 2, 4, 8 or 16 are not handled and yield an empty
// SDValue.
7781 SDLoc dl(Op);
7782 EVT VT = Op.getValueType();
7783
7784 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7785
// Each lane owns BitsPerBool consecutive bits of the mask, and BoolMask is
// that many set bits. In every supported case NumElts * BitsPerBool is 16.
7786 unsigned NumElts = VT.getVectorNumElements();
7787 unsigned BoolMask;
7788 unsigned BitsPerBool;
7789 if (NumElts == 2) {
7790 BitsPerBool = 8;
7791 BoolMask = 0xff;
7792 } else if (NumElts == 4) {
7793 BitsPerBool = 4;
7794 BoolMask = 0xf;
7795 } else if (NumElts == 8) {
7796 BitsPerBool = 2;
7797 BoolMask = 0x3;
7798 } else if (NumElts == 16) {
7799 BitsPerBool = 1;
7800 BoolMask = 0x1;
7801 } else
7802 return SDValue();
7803
7804 // If this is a single value copied into all lanes (a splat), we can just sign
7805 // extend that single value
// This path is only taken when the first operand is not a constant and every
// other operand is either undef or the same value as the first.
7806 SDValue FirstOp = Op.getOperand(0);
7807 if (!isa<ConstantSDNode>(FirstOp) &&
7808 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7809 return U.get().isUndef() || U.get() == FirstOp;
7810 })) {
7811 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7812 DAG.getValueType(MVT::i1));
7813 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7814 }
7815
7816 // First create base with bits set where known
// Non-constant lanes are skipped here; undef lanes are treated as false.
7817 unsigned Bits32 = 0;
7818 for (unsigned i = 0; i < NumElts; ++i) {
7819 SDValue V = Op.getOperand(i);
7820 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7821 continue;
7822 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7823 if (BitSet)
7824 Bits32 |= BoolMask << (i * BitsPerBool);
7825 }
7826
7827 // Add in unknown nodes
// Every lane that is neither constant nor undef is inserted into Base at its
// own index.
7829 DAG.getConstant(Bits32, dl, MVT::i32));
7830 for (unsigned i = 0; i < NumElts; ++i) {
7831 SDValue V = Op.getOperand(i);
7832 if (isa<ConstantSDNode>(V) || V.isUndef())
7833 continue;
7834 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7835 DAG.getConstant(i, dl, MVT::i32));
7836 }
7837
7838 return Base;
7839 }
7840
7842 const ARMSubtarget *ST) {
// Tries to turn a BUILD_VECTOR whose lanes form an arithmetic sequence
// starting at operand 0 into an ARMISD::VIDUP node. Returns an empty SDValue
// when the subtarget lacks MVE integer ops or the operands do not match.
7843 if (!ST->hasMVEIntegerOps())
7844 return SDValue();
7845
7846 // We are looking for a buildvector where each element is Op[0] + i*N
7847 EVT VT = Op.getValueType();
7848 SDValue Op0 = Op.getOperand(0);
7849 unsigned NumElts = VT.getVectorNumElements();
7850
7851 // Get the increment value from operand 1
// Operand 1 must be an ADD whose first operand is Op0.
7852 SDValue Op1 = Op.getOperand(1);
7853 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7855 return SDValue();
// Only increments of 1, 2, 4 or 8 are accepted.
7856 unsigned N = Op1.getConstantOperandVal(1);
7857 if (N != 1 && N != 2 && N != 4 && N != 8)
7858 return SDValue();
7859
7860 // Check that each other operand matches
// Lane I must be an ADD of Op0 and the constant I * N.
7861 for (unsigned I = 2; I < NumElts; I++) {
7862 SDValue OpI = Op.getOperand(I);
7863 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7865 OpI.getConstantOperandVal(1) != I * N)
7866 return SDValue();
7867 }
7868
// The VIDUP node is created with two result types: the vector type and an
// i32.
7869 SDLoc DL(Op);
7870 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7871 DAG.getConstant(N, DL, MVT::i32));
7872 }
7873
7874 // Returns true if the operation N can be treated as qr instruction variant at
7875 // operand Op.
// For the add/multiply style opcodes the position of Op does not matter; for
// the subtract style opcodes Op must be the second value operand. Any opcode
// or intrinsic not listed below yields false.
7876 static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7877 switch (N->getOpcode()) {
7878 case ISD::ADD:
7879 case ISD::MUL:
7880 case ISD::SADDSAT:
7881 case ISD::UADDSAT:
7882 return true;
7883 case ISD::SUB:
7884 case ISD::SSUBSAT:
7885 case ISD::USUBSAT:
// Only accepted when Op is operand 1 of the subtraction.
7886 return N->getOperand(1).getNode() == Op;
// Operand 0 is compared against intrinsic IDs; the value operands therefore
// start at operand 1.
7888 switch (N->getConstantOperandVal(0)) {
7889 case Intrinsic::arm_mve_add_predicated:
7890 case Intrinsic::arm_mve_mul_predicated:
7891 case Intrinsic::arm_mve_qadd_predicated:
7892 case Intrinsic::arm_mve_vhadd:
7893 case Intrinsic::arm_mve_hadd_predicated:
7894 case Intrinsic::arm_mve_vqdmulh:
7895 case Intrinsic::arm_mve_qdmulh_predicated:
7896 case Intrinsic::arm_mve_vqrdmulh:
7897 case Intrinsic::arm_mve_qrdmulh_predicated:
7898 case Intrinsic::arm_mve_vqdmull:
7899 case Intrinsic::arm_mve_vqdmull_predicated:
7900 return true;
7901 case Intrinsic::arm_mve_sub_predicated:
7902 case Intrinsic::arm_mve_qsub_predicated:
7903 case Intrinsic::arm_mve_vhsub:
7904 case Intrinsic::arm_mve_hsub_predicated:
// Only accepted when Op is operand 2 of the intrinsic call node.
7905 return N->getOperand(2).getNode() == Op;
7906 default:
7907 return false;
7908 }
7909 default:
7910 return false;
7911 }
7912 }
7913
7914 // If this is a case we can't handle, return null and let the default
7915 // expansion code take care of it.
// The strategies are tried in the order they appear below: MVE vectors of
// 1-bit elements, VIDUP sequences, constant splats (VDUP / VMOV / VMVN / VMOV
// floating-point immediates), a dominant value splatted with VDUP or VDUPLANE
// plus per-lane inserts, shuffle reconstruction, FP VCVT reconstruction,
// splitting a 128-bit NEON vector into two halves, ARMISD::BUILD_VECTOR for
// elements of 32 bits or more, and finally a chain of INSERT_VECTOR_ELT nodes.
7916 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7917 const ARMSubtarget *ST) const {
7919 SDLoc dl(Op);
7920 EVT VT = Op.getValueType();
7921
7922 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7923 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7924
7925 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7926 return R;
7927
7928 APInt SplatBits, SplatUndef;
7929 unsigned SplatBitSize;
7930 bool HasAnyUndefs;
7931 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
// All splat bits are reported undef: the whole vector is undef.
7932 if (SplatUndef.isAllOnes())
7933 return DAG.getUNDEF(VT);
7934
7935 // If all the users of this constant splat are qr instruction variants,
7936 // generate a vdup of the constant.
7937 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7938 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7939 all_of(BVN->uses(),
7940 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7941 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7942 : SplatBitSize == 16 ? MVT::v8i16
7943 : MVT::v16i8;
7944 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7945 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7946 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7947 }
7948
7949 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7950 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7951 // Check if an immediate VMOV works.
7952 EVT VmovVT;
7953 SDValue Val =
7954 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7955 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7956
7957 if (Val.getNode()) {
7958 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7959 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7960 }
7961
7962 // Try an immediate VMVN.
7963 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7964 Val = isVMOVModifiedImm(
7965 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7966 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7967 if (Val.getNode()) {
7968 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7969 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7970 }
7971
7972 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7973 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7974 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7975 if (ImmVal != -1) {
7976 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7977 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7978 }
7979 }
7980
7981 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7982 // type.
7983 if (ST->hasMVEIntegerOps() &&
7984 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7985 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7986 : SplatBitSize == 16 ? MVT::v8i16
7987 : MVT::v16i8;
7988 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7989 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7990 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7991 }
7992 }
7993 }
7994
7995 // Scan through the operands to see if only one value is used.
7996 //
7997 // As an optimisation, even if more than one value is used it may be more
7998 // profitable to splat with one value then change some lanes.
7999 //
8000 // Heuristically we decide to do this if the vector has a "dominant" value,
8001 // defined as splatted to more than half of the lanes.
8002 unsigned NumElts = VT.getVectorNumElements();
8003 bool isOnlyLowElement = true;
8004 bool usesOnlyOneValue = true;
8005 bool hasDominantValue = false;
8006 bool isConstant = true;
8007
8008 // Map of the number of times a particular SDValue appears in the
8009 // element list.
8010 DenseMap<SDValue, unsigned> ValueCounts;
8011 SDValue Value;
// Undef lanes are ignored by the scan: they are not counted and do not affect
// any of the flags.
8012 for (unsigned i = 0; i < NumElts; ++i) {
8013 SDValue V = Op.getOperand(i);
8014 if (V.isUndef())
8015 continue;
8016 if (i > 0)
8017 isOnlyLowElement = false;
8019 isConstant = false;
8020
8021 unsigned &Count = ValueCounts[V];
8022
8023 // Is this value dominant? (takes up more than half of the lanes)
8024 if (++Count > (NumElts / 2)) {
8025 hasDominantValue = true;
8026 Value = V;
8027 }
8028 }
8029 if (ValueCounts.size() != 1)
8030 usesOnlyOneValue = false;
// Without a dominant value, fall back to whichever value the map yields first.
8031 if (!Value.getNode() && !ValueCounts.empty())
8032 Value = ValueCounts.begin()->first;
8033
// No defined lane at all: the result is undef.
8034 if (ValueCounts.empty())
8035 return DAG.getUNDEF(VT);
8036
8037 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
8038 // Keep going if we are hitting this case.
8039 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
8040 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
8041
8042 unsigned EltSize = VT.getScalarSizeInBits();
8043
8044 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
8045 // i32 and try again.
8046 if (hasDominantValue && EltSize <= 32) {
8047 if (!isConstant) {
8048 SDValue N;
8049
8050 // If we are VDUPing a value that comes directly from a vector, that will
8051 // cause an unnecessary move to and from a GPR, where instead we could
8052 // just use VDUPLANE. We can only do this if the lane being extracted
8053 // is at a constant index, as the VDUP from lane instructions only have
8054 // constant-index forms.
8055 ConstantSDNode *constIndex;
8056 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8057 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
8058 // We need to create a new undef vector to use for the VDUPLANE if the
8059 // size of the vector from which we get the value is different than the
8060 // size of the vector that we need to create. We will insert the element
8061 // such that the register coalescer will remove unnecessary copies.
8062 if (VT != Value->getOperand(0).getValueType()) {
8063 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
8065 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8066 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
8067 Value, DAG.getConstant(index, dl, MVT::i32)),
8068 DAG.getConstant(index, dl, MVT::i32));
8069 } else
8070 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8071 Value->getOperand(0), Value->getOperand(1));
8072 } else
8073 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
8074
8075 if (!usesOnlyOneValue) {
8076 // The dominant value was splatted as 'N', but we now have to insert
8077 // all differing elements.
8078 for (unsigned I = 0; I < NumElts; ++I) {
8079 if (Op.getOperand(I) == Value)
8080 continue;
8082 Ops.push_back(N);
8083 Ops.push_back(Op.getOperand(I));
8084 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
8085 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
8086 }
8087 }
8088 return N;
8089 }
// Floating-point element case: bitcast every operand to the same-width
// integer type, lower the resulting integer BUILD_VECTOR recursively, and
// bitcast the result back if that succeeded.
8093 assert(FVT == MVT::f32 || FVT == MVT::f16);
8094 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
8095 for (unsigned i = 0; i < NumElts; ++i)
8096 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
8097 Op.getOperand(i)));
8098 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
8099 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
8100 Val = LowerBUILD_VECTOR(Val, DAG, ST);
8101 if (Val.getNode())
8102 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8103 }
// A constant splat whose value can be produced by a single instruction is
// emitted as a VDUP of that constant.
8104 if (usesOnlyOneValue) {
8105 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
8106 if (isConstant && Val.getNode())
8107 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
8108 }
8109 }
8110
8111 // If all elements are constants and the case above didn't get hit, fall back
8112 // to the default expansion, which will generate a load from the constant
8113 // pool.
8114 if (isConstant)
8115 return SDValue();
8116
8117 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
8118 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
8119 // length <= 2.
8120 if (NumElts >= 4)
8121 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
8122 return shuffle;
8123
8124 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
8125 // VCVT's
8126 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
8127 return VCVT;
8128 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
8129 return VCVT;
8130
8131 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
8132 // If we haven't found an efficient lowering, try splitting a 128-bit vector
8133 // into two 64-bit vectors; we might discover a better way to lower it.
8134 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
8135 EVT ExtVT = VT.getVectorElementType();
8136 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
8137 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
8138 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
8139 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
8140 SDValue Upper =
8141 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
8142 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
8143 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
// Either half may have failed to lower (null SDValue); only concatenate when
// both succeeded, otherwise fall through to the strategies below.
8144 if (Lower && Upper)
8145 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
8146 }
8147
8148 // Vectors with 32- or 64-bit elements can be built by directly assigning
8149 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
8150 // will be legalized.
8151 if (EltSize >= 32) {
8152 // Do the expansion with floating-point types, since that is what the VFP
8153 // registers are defined to use, and since i64 is not legal.
8154 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8155 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8157 for (unsigned i = 0; i < NumElts; ++i)
8158 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
8159 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8160 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8161 }
8162
8163 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
8164 // know the default expansion would otherwise fall back on something even
8165 // worse. For a vector with one or two non-undef values, that's
8166 // scalar_to_vector for the elements followed by a shuffle (provided the
8167 // shuffle is valid for the target) and materialization element by element
8168 // on the stack followed by a load for everything else.
8169 if (!isConstant && !usesOnlyOneValue) {
8170 SDValue Vec = DAG.getUNDEF(VT);
8171 for (unsigned i = 0 ; i < NumElts; ++i) {
8172 SDValue V = Op.getOperand(i);
8173 if (V.isUndef())
8174 continue;
8175 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
8176 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
8177 }
8178 return Vec;
8179 }
8180
8181 return SDValue();
8182 }
8183
8184// Gather data to see if the operation can be modelled as a
8185// shuffle in combination with VEXTs.
//
// Tries to rewrite a BUILD_VECTOR whose defined operands are all
// constant-index EXTRACT_VECTOR_ELTs taken from at most two source vectors as
// one vector shuffle. Returns an empty SDValue when an operand is not such an
// extract, when more than two sources are involved, when a source is neither
// the same size as, half the size of, nor twice the size of the result, when
// the lanes used from a double-width source span too many elements, or when
// buildLegalVectorShuffle produces nothing. On success the shuffle is cast
// back to the BUILD_VECTOR's type with ARMISD::VECTOR_REG_CAST.
// NOTE(review): the declarations of `Sources` and `Mask` are not visible in
// this listing (listing lines 8215 and 8342 are absent) - confirm against the
// real file before editing the code below.
8186SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
8187 SelectionDAG &DAG) const {
8188 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
8189 SDLoc dl(Op);
8190 EVT VT = Op.getValueType();
8191 unsigned NumElts = VT.getVectorNumElements();
8192
 // Per-source bookkeeping: the original vector, the range of lanes read from
 // it, and the (possibly resized / recast) vector that is fed to the shuffle.
8193 struct ShuffleSourceInfo {
8194 SDValue Vec;
8195 unsigned MinElt = std::numeric_limits<unsigned>::max();
8196 unsigned MaxElt = 0;
8197
8198 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
8199 // be compatible with the shuffle we intend to construct. As a result
8200 // ShuffleVec will be some sliding window into the original Vec.
8201 SDValue ShuffleVec;
8202
8203 // Code should guarantee that element i in Vec starts at element "WindowBase
8204 // + i * WindowScale in ShuffleVec".
8205 int WindowBase = 0;
8206 int WindowScale = 1;
8207
8208 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
8209
 // Lets llvm::find locate an entry by its original source vector.
8210 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
8211 };
8212
8213 // First gather all vectors used as an immediate source for this BUILD_VECTOR
8214 // node.
8216 for (unsigned i = 0; i < NumElts; ++i) {
8217 SDValue V = Op.getOperand(i);
8218 if (V.isUndef())
8219 continue;
8220 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
8221 // A shuffle can only come from building a vector from various
8222 // elements of other vectors.
8223 return SDValue();
8224 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
8225 // Furthermore, shuffles require a constant mask, whereas extractelts
8226 // accept variable indices.
8227 return SDValue();
8228 }
8229
8230 // Add this element source to the list if it's not already there.
8231 SDValue SourceVec = V.getOperand(0);
8232 auto Source = llvm::find(Sources, SourceVec);
8233 if (Source == Sources.end())
8234 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
8235
8236 // Update the minimum and maximum lane number seen.
8237 unsigned EltNo = V.getConstantOperandVal(1);
8238 Source->MinElt = std::min(Source->MinElt, EltNo);
8239 Source->MaxElt = std::max(Source->MaxElt, EltNo);
8240 }
8241
8242 // Currently only do something sane when at most two source vectors
8243 // are involved.
8244 if (Sources.size() > 2)
8245 return SDValue();
8246
8247 // Find out the smallest element size among result and two sources, and use
8248 // it as element size to build the shuffle_vector.
8249 EVT SmallestEltTy = VT.getVectorElementType();
8250 for (auto &Source : Sources) {
8251 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8252 if (SrcEltTy.bitsLT(SmallestEltTy))
8253 SmallestEltTy = SrcEltTy;
8254 }
 // ResMultiplier is the number of shuffle lanes covered by one result element.
 // From here on NumElts counts lanes of the shuffle type, not of VT.
8255 unsigned ResMultiplier =
8256 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
8257 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
8258 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8259
8260 // If the source vector is too wide or too narrow, we may nevertheless be able
8261 // to construct a compatible shuffle either by concatenating it with UNDEF or
8262 // extracting a suitable range of elements.
8263 for (auto &Src : Sources) {
8264 EVT SrcVT = Src.ShuffleVec.getValueType();
8265
8266 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8267 uint64_t VTSize = VT.getFixedSizeInBits();
8268 if (SrcVTSize == VTSize)
8269 continue;
8270
8271 // This stage of the search produces a source with the same element type as
8272 // the original, but with a total width matching the BUILD_VECTOR output.
8273 EVT EltVT = SrcVT.getVectorElementType();
8274 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8275 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8276
8277 if (SrcVTSize < VTSize) {
 // Only a source of exactly half the result width is handled.
8278 if (2 * SrcVTSize != VTSize)
8279 return SDValue();
8280 // We can pad out the smaller vector for free, so if it's part of a
8281 // shuffle...
8282 Src.ShuffleVec =
8283 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8284 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8285 continue;
8286 }
8287
 // Only a source of exactly twice the result width is handled.
8288 if (SrcVTSize != 2 * VTSize)
8289 return SDValue();
8290
8291 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8292 // Span too large for a VEXT to cope
8293 return SDValue();
8294 }
8295
8296 if (Src.MinElt >= NumSrcElts) {
8297 // The extraction can just take the second half
8298 Src.ShuffleVec =
8299 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8300 DAG.getConstant(NumSrcElts, dl, MVT::i32));
 // Lane i of Vec is now lane i - NumSrcElts of ShuffleVec.
8301 Src.WindowBase = -NumSrcElts;
8302 } else if (Src.MaxElt < NumSrcElts) {
8303 // The extraction can just take the first half
8304 Src.ShuffleVec =
8305 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8306 DAG.getConstant(0, dl, MVT::i32));
8307 } else {
8308 // An actual VEXT is needed
8309 SDValue VEXTSrc1 =
8310 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8311 DAG.getConstant(0, dl, MVT::i32));
8312 SDValue VEXTSrc2 =
8313 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8314 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8315
8316 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8317 VEXTSrc2,
8318 DAG.getConstant(Src.MinElt, dl, MVT::i32));
 // The window now starts at MinElt, so lane i of Vec is lane i - MinElt.
8319 Src.WindowBase = -Src.MinElt;
8320 }
8321 }
8322
8323 // Another possible incompatibility occurs from the vector element types. We
8324 // can fix this by bitcasting the source vectors to the same type we intend
8325 // for the shuffle.
8326 for (auto &Src : Sources) {
8327 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8328 if (SrcEltTy == SmallestEltTy)
8329 continue;
8330 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8331 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
 // Each original element now spans WindowScale shuffle lanes; rescale the
 // window base to the same lane units.
8332 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8333 Src.WindowBase *= Src.WindowScale;
8334 }
8335
8336 // Final check before we try to actually produce a shuffle.
8337 LLVM_DEBUG(for (auto Src
8338 : Sources)
8339 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
8340
8341 // The stars all align, our next step is to produce the mask for the shuffle.
8343 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8344 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8345 SDValue Entry = Op.getOperand(i);
8346 if (Entry.isUndef())
8347 continue;
8348
8349 auto Src = llvm::find(Sources, Entry.getOperand(0));
8350 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8351
8352 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8353 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8354 // segment.
8355 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8356 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8357 VT.getScalarSizeInBits());
8358 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8359
8360 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8361 // starting at the appropriate offset.
8362 int *LaneMask = &Mask[i * ResMultiplier];
8363
8364 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
 // Lanes of the second source are numbered after the NumElts lanes of the
 // first one in the shuffle mask.
8365 ExtractBase += NumElts * (Src - Sources.begin());
8366 for (int j = 0; j < LanesDefined; ++j)
8367 LaneMask[j] = ExtractBase + j;
8368 }
8369
8370
8371 // We can't handle more than two sources. This should have already
8372 // been checked before this point.
8373 assert(Sources.size() <= 2 && "Too many sources!");
8374
 // A missing second (or first) source is represented by UNDEF.
8375 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8376 for (unsigned i = 0; i < Sources.size(); ++i)
8377 ShuffleOps[i] = Sources[i].ShuffleVec;
8378
8379 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8380 ShuffleOps[1], Mask, DAG);
8381 if (!Shuffle)
8382 return SDValue();
8383 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8384}
8385
 // NOTE(review): only part of the perfect-shuffle opcode enumeration is visible
 // in this listing: the enum header, its closing brace and the OP_VREV,
 // OP_VDUP0..3 and OP_VEXT1..3 enumerators used by the code below are absent.
 // GeneratePerfectShuffle subtracts the "left" enumerator of each VUZP/VZIP/VTRN
 // pair from the opcode to select result 0 or 1, so each L/R pair has to stay
 // adjacent and in this order.
8387 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8396 OP_VUZPL, // VUZP, left result
8397 OP_VUZPR, // VUZP, right result
8398 OP_VZIPL, // VZIP, left result
8399 OP_VZIPR, // VZIP, right result
8400 OP_VTRNL, // VTRN, left result
8401 OP_VTRNR // VTRN, right result
8403
8404static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8405 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8406 switch (OpNum) {
8407 case OP_COPY:
8408 case OP_VREV:
8409 case OP_VDUP0:
8410 case OP_VDUP1:
8411 case OP_VDUP2:
8412 case OP_VDUP3:
8413 return true;
8414 }
8415 return false;
8416}
8417
8418/// isShuffleMaskLegal - Targets can use this to indicate that they only
8419/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8420/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8421/// are assumed to be legal.
// NOTE(review): the line carrying this function's signature (listing line
// 8422) and one line of the large condition below (listing line 8448, between
// "EltSize >= 32 ||" and the identity-mask test) are missing from this
// listing. M is used as the shuffle mask and VT as the vector type; they are
// presumably the parameters - confirm against the real file.
8423 if (VT.getVectorNumElements() == 4 &&
8424 (VT.is128BitVector() || VT.is64BitVector())) {
 // Build the base-9 digits of the table index; undef lanes (negative mask
 // entries) use digit 8.
8425 unsigned PFIndexes[4];
8426 for (unsigned i = 0; i != 4; ++i) {
8427 if (M[i] < 0)
8428 PFIndexes[i] = 8;
8429 else
8430 PFIndexes[i] = M[i];
8431 }
8432
8433 // Compute the index in the perfect shuffle table.
8434 unsigned PFTableIndex =
8435 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8436 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
 // The cost is stored in the top two bits of the 32-bit table entry.
8437 unsigned Cost = (PFEntry >> 30);
8438
 // Without NEON only entries whose top-level operation passes
 // isLegalMVEShuffleOp are accepted here.
8439 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8440 return true;
8441 }
8442
 // Out-parameters of the mask matchers below; their values are not used here.
8443 bool ReverseVEXT, isV_UNDEF;
8444 unsigned Imm, WhichResult;
8445
8446 unsigned EltSize = VT.getScalarSizeInBits();
8447 if (EltSize >= 32 ||
8449 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8450 isVREVMask(M, VT, 64) ||
8451 isVREVMask(M, VT, 32) ||
8452 isVREVMask(M, VT, 16))
8453 return true;
8454 else if (Subtarget->hasNEON() &&
8455 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8456 isVTBLMask(M, VT) ||
8457 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8458 return true;
8459 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8460 isReverseMask(M, VT))
8461 return true;
8462 else if (Subtarget->hasMVEIntegerOps() &&
8463 (isVMOVNMask(M, VT, true, false) ||
8464 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8465 return true;
8466 else if (Subtarget->hasMVEIntegerOps() &&
8467 (isTruncMask(M, VT, false, false) ||
8468 isTruncMask(M, VT, false, true) ||
8469 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8470 return true;
8471 else
8472 return false;
8473}
8474
8475/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8476/// the specified operations to build the shuffle.
8477static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
8478 SDValue RHS, SelectionDAG &DAG,
8479 const SDLoc &dl) {
8480 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8481 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8482 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8483
8484 if (OpNum == OP_COPY) {
8485 if (LHSID == (1*9+2)*9+3) return LHS;
8486 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8487 return RHS;
8488 }
8489
8490 SDValue OpLHS, OpRHS;
8491 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8492 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8493 EVT VT = OpLHS.getValueType();
8494
8495 switch (OpNum) {
8496 default: llvm_unreachable("Unknown shuffle opcode!");
8497 case OP_VREV:
8498 // VREV divides the vector in half and swaps within the half.
8499 if (VT.getScalarSizeInBits() == 32)
8500 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8501 // vrev <4 x i16> -> VREV32
8502 if (VT.getScalarSizeInBits() == 16)
8503 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8504 // vrev <4 x i8> -> VREV16
8505 assert(VT.getScalarSizeInBits() == 8);
8506 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8507 case OP_VDUP0:
8508 case OP_VDUP1:
8509 case OP_VDUP2:
8510 case OP_VDUP3:
8511 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8512 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8513 case OP_VEXT1:
8514 case OP_VEXT2:
8515 case OP_VEXT3:
8516 return DAG.getNode(ARMISD::VEXT, dl, VT,
8517 OpLHS, OpRHS,
8518 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8519 case OP_VUZPL:
8520 case OP_VUZPR:
8521 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8522 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8523 case OP_VZIPL:
8524 case OP_VZIPR:
8525 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8526 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8527 case OP_VTRNL:
8528 case OP_VTRNR:
8529 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8530 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8531 }
8532}
8533
// Lowers a shuffle to a table lookup: ARMISD::VTBL1 when the second shuffle
// operand is undef, ARMISD::VTBL2 otherwise. The shuffle mask is passed as a
// v8i8 BUILD_VECTOR of i32 constants and the result type is always v8i8.
// NOTE(review): the first line of the signature (listing line 8534) is missing
// from this listing; presumably this is LowerVECTOR_SHUFFLEv8i8(SDValue Op, ...)
// as called from the v8i8 NEON path - confirm.
8535 ArrayRef<int> ShuffleMask,
8536 SelectionDAG &DAG) {
8537 // Check to see if we can use the VTBL instruction.
8538 SDValue V1 = Op.getOperand(0);
8539 SDValue V2 = Op.getOperand(1);
8540 SDLoc DL(Op);
8541
 // One i32 constant per mask entry; getSignedConstant is used because mask
 // entries may be negative (undef lanes).
8542 SmallVector<SDValue, 8> VTBLMask;
8543 for (int I : ShuffleMask)
8544 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8545
 // A single table register is enough when only V1 is read.
8546 if (V2.getNode()->isUndef())
8547 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8548 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8549
8550 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8551 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8552}
8553
// Lowers a full reversal of a v8i16 / v8f16 / v16i8 vector in two steps:
// ARMISD::VREV64 on the first shuffle operand, followed by a shuffle that
// swaps the two halves of the result.
// NOTE(review): the signature line (listing line 8554) is missing from this
// listing; presumably LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
// - confirm.
8555 SDLoc DL(Op);
8556 EVT VT = Op.getValueType();
8557
8558 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8559 "Expect an v8i16/v16i8 type");
8560 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8561 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8562 // extract the first 8 bytes into the top double word and the last 8 bytes
8563 // into the bottom double word, through a new vector shuffle that will be
8564 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
 // NewMask = <N/2, ..., N-1, 0, ..., N/2-1>, i.e. upper half first.
8565 std::vector<int> NewMask;
8566 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8567 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8568 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8569 NewMask.push_back(i);
8570 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8571}
8572
// Maps a predicate vector type (v2i1, v4i1, v8i1, v16i1) to the 128-bit vector
// type with the same number of lanes. Note that v2i1 maps to v2f64, not to an
// integer type. Any other input type is treated as unreachable.
// NOTE(review): the signature line (listing line 8573) is missing from this
// listing; presumably getVectorTyFromPredicateVector(EVT VT) - confirm.
8574 switch (VT.getSimpleVT().SimpleTy) {
8575 case MVT::v2i1:
8576 return MVT::v2f64;
8577 case MVT::v4i1:
8578 return MVT::v4i32;
8579 case MVT::v8i1:
8580 return MVT::v8i16;
8581 case MVT::v16i1:
8582 return MVT::v16i8;
8583 default:
8584 llvm_unreachable("Unexpected vector predicate type");
8585 }
8586}
8587
// Turns a predicate vector into an ordinary vector: every byte of the result
// is all-ones where the (v16i1-recast) predicate is set and zero elsewhere,
// and the v16i8 result is then bitcast to NewVT.
// NOTE(review): three lines are missing from this listing - the first line of
// the signature (8588), the "AllOnes" declaration (8593) and the declaration
// of NewVT (8602). The call sites pass (dl, predicate value, predicate type,
// DAG); confirm names and types against the real file.
8589 SelectionDAG &DAG) {
8590 // Converting from boolean predicates to integers involves creating a vector
8591 // of all ones or all zeroes and selecting the lanes based upon the real
8592 // predicate.
8594 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8595 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8596
8597 SDValue AllZeroes =
8598 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8599 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8600
8601 // Get full vector type from predicate type
8603
8604 SDValue RecastV1;
8605 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
8606 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8607 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8608 // since we know in hardware the sizes are really the same.
8609 if (VT != MVT::v16i1)
8610 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8611 else
8612 RecastV1 = Pred;
8613
8614 // Select either all ones or zeroes depending upon the real predicate bits.
8615 SDValue PredAsVector =
8616 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8617
8618 // Recast our new predicate-as-integer v16i8 vector into something
8619 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8620 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8621}
8622
// Lowers a shuffle of predicate (i1 element) vectors. A full reversal is done
// on the predicate's i32 form with BITREVERSE and a shift; every other mask is
// handled by promoting both operands with PromoteMVEPredVector, shuffling the
// promoted vectors and comparing the result against zero to get a predicate
// back. Requires MVE integer ops (asserted).
// NOTE(review): the first line of the signature (listing line 8623) and the
// declaration of SVN (listing line 8626) are missing from this listing.
8624 const ARMSubtarget *ST) {
8625 EVT VT = Op.getValueType();
8627 ArrayRef<int> ShuffleMask = SVN->getMask();
8628
8629 assert(ST->hasMVEIntegerOps() &&
8630 "No support for vector shuffle of boolean predicates");
8631
8632 SDValue V1 = Op.getOperand(0);
8633 SDValue V2 = Op.getOperand(1);
8634 SDLoc dl(Op);
8635 if (isReverseMask(ShuffleMask, VT)) {
 // Reverse all 32 bits, then shift the result down by 16.
 // NOTE(review): presumably the predicate occupies the low 16 bits of the
 // i32, which is why the reversed value is shifted right by 16 - confirm.
8636 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8637 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8638 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8639 DAG.getConstant(16, dl, MVT::i32));
8640 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8641 }
8642
8643 // Until we can come up with optimised cases for every single vector
8644 // shuffle in existence we have chosen the least painful strategy. This is
8645 // to essentially promote the boolean predicate to a 8-bit integer, where
8646 // each predicate represents a byte. Then we fall back on a normal integer
8647 // vector shuffle and convert the result back into a predicate vector. In
8648 // many cases the generated code might be even better than scalar code
8649 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8650 // fields in a register into 8 other arbitrary 2-bit fields!
8651 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8652 EVT NewVT = PredAsVector1.getValueType();
 // An undef second operand stays undef instead of being promoted.
8653 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8654 : PromoteMVEPredVector(dl, V2, VT, DAG);
8655 assert(PredAsVector2.getValueType() == NewVT &&
8656 "Expected identical vector type in expanded i1 shuffle!");
8657
8658 // Do the shuffle!
8659 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8660 PredAsVector2, ShuffleMask);
8661
8662 // Now return the result of comparing the shuffled vector with zero,
8663 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8664 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8665 if (VT == MVT::v2i1) {
8666 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8667 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8668 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8669 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8670 }
8671 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8672 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8673}
8674
// Lowers a v8i16 / v8f16 / v16i8 shuffle by treating the result as four
// 32-bit parts. Each part whose mask entries select one whole, aligned 32-bit
// lane of an input is taken directly as an f32 element of that input (viewed
// as v4f32); the remaining parts come from a reduced shuffle. Returns an empty
// SDValue for element sizes of 32 bits or more, or when no part is a whole
// lane.
// NOTE(review): the first line of the signature (listing line 8675) is missing
// from this listing.
8676 ArrayRef<int> ShuffleMask,
8677 SelectionDAG &DAG) {
8678 // Attempt to lower the vector shuffle using as many whole register movs as
8679 // possible. This is useful for types smaller than 32bits, which would
8680 // often otherwise become a series of GPR movs.
8681 SDLoc dl(Op);
8682 EVT VT = Op.getValueType();
8683 if (VT.getScalarSizeInBits() >= 32)
8684 return SDValue();
8685
8686 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8687 "Unexpected vector type");
8688 int NumElts = VT.getVectorNumElements();
8689 int QuarterSize = NumElts / 4;
8690 // The four final parts of the vector, as i32's
8691 SDValue Parts[4];
8692
8693 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8694 // <u,u,u,u>), returning the vmov lane index
8695 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8696 // Detect which mov lane this would be from the first non-undef element.
8697 int MovIdx = -1;
8698 for (int i = 0; i < Length; i++) {
8699 if (ShuffleMask[Start + i] >= 0) {
8700 if (ShuffleMask[Start + i] % Length != i)
8701 return -1;
8702 MovIdx = ShuffleMask[Start + i] / Length;
8703 break;
8704 }
8705 }
8706 // If all items are undef, leave this for other combines
8707 if (MovIdx == -1)
8708 return -1;
8709 // Check the remaining values are the correct part of the same mov
8710 for (int i = 1; i < Length; i++) {
8711 if (ShuffleMask[Start + i] >= 0 &&
8712 (ShuffleMask[Start + i] / Length != MovIdx ||
8713 ShuffleMask[Start + i] % Length != i))
8714 return -1;
8715 }
8716 return MovIdx;
8717 };
8718
8719 for (int Part = 0; Part < 4; ++Part) {
8720 // Does this part look like a mov
8721 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8722 if (Elt != -1) {
 // Lane indices 0-3 address the first operand, 4-7 the second.
8723 SDValue Input = Op->getOperand(0);
8724 if (Elt >= 4) {
8725 Input = Op->getOperand(1);
8726 Elt -= 4;
8727 }
8728 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8729 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8730 DAG.getConstant(Elt, dl, MVT::i32));
8731 }
8732 }
8733
8734 // Nothing interesting found, just return
8735 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8736 return SDValue();
8737
8738 // The other parts need to be built with the old shuffle vector, cast to a
8739 // v4i32 and extract_vector_elts
8740 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
 // Parts already covered by a whole-lane mov become undef in the reduced
 // shuffle; only the remaining parts keep their original mask entries.
8741 SmallVector<int, 16> NewShuffleMask;
8742 for (int Part = 0; Part < 4; ++Part)
8743 for (int i = 0; i < QuarterSize; i++)
8744 NewShuffleMask.push_back(
8745 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8746 SDValue NewShuffle = DAG.getVectorShuffle(
8747 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8748 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8749
8750 for (int Part = 0; Part < 4; ++Part)
8751 if (!Parts[Part])
8752 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8753 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8754 }
8755 // Build a vector out of the various parts and bitcast it back to the original
8756 // type.
8757 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8758 return DAG.getBitcast(VT, NewVec);
8759}
8760
// Lowers a shuffle that is an identity of one operand except for exactly one
// lane into an EXTRACT_VECTOR_ELT / INSERT_VECTOR_ELT pair. Returns an empty
// SDValue when the mask is not of that form for either operand.
// NOTE(review): the first line of the signature (listing line 8761) is missing
// from this listing.
8762 ArrayRef<int> ShuffleMask,
8763 SelectionDAG &DAG) {
8764 SDValue V1 = Op.getOperand(0);
8765 SDValue V2 = Op.getOperand(1);
8766 EVT VT = Op.getValueType();
8767 unsigned NumElts = VT.getVectorNumElements();
8768
8769 // A One-Off Identity mask is one that is mostly an identity mask from a
8770 // single source but contains a single element out-of-place, either from a
8771 // different vector or from another position in the same vector. As opposed to
8772 // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert
8773 // pair directly.
 // BaseOffset is 0 to test against the first operand and NumElts to test
 // against the second. On success OffElement is the index of the one lane
 // that breaks the identity; at least three lanes must be defined.
8774 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8775 int &OffElement) {
8776 OffElement = -1;
8777 int NonUndef = 0;
8778 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8779 if (Mask[i] == -1)
8780 continue;
8781 NonUndef++;
8782 if (Mask[i] != i + BaseOffset) {
8783 if (OffElement == -1)
8784 OffElement = i;
8785 else
8786 return false;
8787 }
8788 }
8789 return NonUndef > 2 && OffElement != -1;
8790 };
8791 int OffElement;
8792 SDValue VInput;
8793 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8794 VInput = V1;
8795 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8796 VInput = V2;
8797 else
8798 return SDValue();
8799
8800 SDLoc dl(Op);
 // i8 and i16 elements are extracted as i32; other element types are
 // extracted in their own scalar type.
8801 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8802 ? MVT::i32
8803 : VT.getScalarType();
 // The out-of-place element is read from whichever operand its mask value
 // points into, at that value modulo NumElts.
8804 SDValue Elt = DAG.getNode(
8805 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8806 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8807 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8808 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8809 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8810}
8811
// Custom lowering of a vector shuffle. The strategies are tried in this order:
//   1. i1 element vectors with MVE -> LowerVECTOR_SHUFFLE_i1.
//   2. For elements of at most 32 bits: splat (VDUP / VDUPLANE), VEXT (NEON),
//      VREV64/32/16, single-source VEXT (NEON), NEON two-result shuffles,
//      MVE VMOVN, and NEON two-result shuffles seen through CONCAT_VECTORS.
//   3. With MVE and elements of at most 32 bits: one-off identity insert and
//      MVETRUNC.
//   4. 4-element vectors: the perfect-shuffle table.
//   5. Elements of 32 bits or more: ARMISD::BUILD_VECTOR of extracted lanes.
//   6. Full reversal, VTBL for v8i8 (NEON), whole-lane movs (MVE).
// Returns an empty SDValue when nothing applies.
// NOTE(review): four lines are missing from this listing - the first line of
// the signature (8812), the declaration of SVN (8818), the second half of the
// BUILD_VECTOR condition in the splat path (8846) and the declaration of Ops in
// the >= 32-bit path (9011). Confirm against the real file before editing.
8813 const ARMSubtarget *ST) {
8814 SDValue V1 = Op.getOperand(0);
8815 SDValue V2 = Op.getOperand(1);
8816 SDLoc dl(Op);
8817 EVT VT = Op.getValueType();
8819 unsigned EltSize = VT.getScalarSizeInBits();
8820
8821 if (ST->hasMVEIntegerOps() && EltSize == 1)
8822 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8823
8824 // Convert shuffles that are directly supported on NEON to target-specific
8825 // DAG nodes, instead of keeping them as shuffles and matching them again
8826 // during code selection. This is more efficient and avoids the possibility
8827 // of inconsistencies between legalization and selection.
8828 // FIXME: floating-point vectors should be canonicalized to integer vectors
8829 // of the same time so that they get CSEd properly.
8830 ArrayRef<int> ShuffleMask = SVN->getMask();
8831
8832 if (EltSize <= 32) {
8833 if (SVN->isSplat()) {
8834 int Lane = SVN->getSplatIndex();
8835 // If this is undef splat, generate it via "just" vdup, if possible.
8836 if (Lane == -1) Lane = 0;
8837
8838 // Test if V1 is a SCALAR_TO_VECTOR.
8839 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8840 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8841 }
8842 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8843 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8844 // reaches it).
8845 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8847 bool IsScalarToVector = true;
 // Equivalent to SCALAR_TO_VECTOR only if every operand after the first
 // is undef.
8848 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8849 if (!V1.getOperand(i).isUndef()) {
8850 IsScalarToVector = false;
8851 break;
8852 }
8853 if (IsScalarToVector)
8854 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8855 }
8856 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8857 DAG.getConstant(Lane, dl, MVT::i32));
8858 }
8859
8860 bool ReverseVEXT = false;
8861 unsigned Imm = 0;
8862 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8863 if (ReverseVEXT)
8864 std::swap(V1, V2);
8865 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8866 DAG.getConstant(Imm, dl, MVT::i32));
8867 }
8868
8869 if (isVREVMask(ShuffleMask, VT, 64))
8870 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8871 if (isVREVMask(ShuffleMask, VT, 32))
8872 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8873 if (isVREVMask(ShuffleMask, VT, 16))
8874 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8875
8876 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8877 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8878 DAG.getConstant(Imm, dl, MVT::i32));
8879 }
8880
8881 // Check for Neon shuffles that modify both input vectors in place.
8882 // If both results are used, i.e., if there are two shuffles with the same
8883 // source operands and with masks corresponding to both results of one of
8884 // these operations, DAG memoization will ensure that a single node is
8885 // used for both shuffles.
8886 unsigned WhichResult = 0;
8887 bool isV_UNDEF = false;
8888 if (ST->hasNEON()) {
8889 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8890 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8891 if (isV_UNDEF)
8892 V2 = V1;
8893 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8894 .getValue(WhichResult);
8895 }
8896 }
8897 if (ST->hasMVEIntegerOps()) {
 // Note the operand order: V2, V1 for the (false, false) form; V1, V2 and
 // V1, V1 for the two "true" forms.
8898 if (isVMOVNMask(ShuffleMask, VT, false, false))
8899 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8900 DAG.getConstant(0, dl, MVT::i32));
8901 if (isVMOVNMask(ShuffleMask, VT, true, false))
8902 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8903 DAG.getConstant(1, dl, MVT::i32));
8904 if (isVMOVNMask(ShuffleMask, VT, true, true))
8905 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8906 DAG.getConstant(1, dl, MVT::i32));
8907 }
8908
8909 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8910 // shuffles that produce a result larger than their operands with:
8911 // shuffle(concat(v1, undef), concat(v2, undef))
8912 // ->
8913 // shuffle(concat(v1, v2), undef)
8914 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8915 //
8916 // This is useful in the general case, but there are special cases where
8917 // native shuffles produce larger results: the two-result ops.
8918 //
8919 // Look through the concat when lowering them:
8920 // shuffle(concat(v1, v2), undef)
8921 // ->
8922 // concat(VZIP(v1, v2):0, :1)
8923 //
8924 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8925 SDValue SubV1 = V1->getOperand(0);
8926 SDValue SubV2 = V1->getOperand(1);
8927 EVT SubVT = SubV1.getValueType();
8928
8929 // We expect these to have been canonicalized to -1.
8930 assert(llvm::all_of(ShuffleMask, [&](int i) {
8931 return i < (int)VT.getVectorNumElements();
8932 }) && "Unexpected shuffle index into UNDEF operand!");
8933
8934 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8935 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8936 if (isV_UNDEF)
8937 SubV2 = SubV1;
8938 assert((WhichResult == 0) &&
8939 "In-place shuffle of concat can only have one result!");
8940 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8941 SubV1, SubV2);
8942 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8943 Res.getValue(1));
8944 }
8945 }
8946 }
8947
8948 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8949 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8950 return V;
8951
 // Try all four truncation mask forms. Inputs are viewed as vectors of
 // double-width integers; for the "Top" forms they are first shifted right
 // by EltSize so the upper half of each wide element is what gets truncated.
8952 for (bool Top : {false, true}) {
8953 for (bool SingleSource : {false, true}) {
8954 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8955 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8956 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8957 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8958 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8959 SingleSource ? V1 : V2);
8960 if (Top) {
8961 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8962 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8963 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8964 }
8965 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8966 }
8967 }
8968 }
8969 }
8970
8971 // If the shuffle is not directly supported and it has 4 elements, use
8972 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8973 unsigned NumElts = VT.getVectorNumElements();
8974 if (NumElts == 4) {
 // Base-9 digits of the table index; undef lanes use digit 8.
8975 unsigned PFIndexes[4];
8976 for (unsigned i = 0; i != 4; ++i) {
8977 if (ShuffleMask[i] < 0)
8978 PFIndexes[i] = 8;
8979 else
8980 PFIndexes[i] = ShuffleMask[i];
8981 }
8982
8983 // Compute the index in the perfect shuffle table.
8984 unsigned PFTableIndex =
8985 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8986 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8987 unsigned Cost = (PFEntry >> 30);
8988
8989 if (Cost <= 4) {
8990 if (ST->hasNEON())
8991 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8992 else if (isLegalMVEShuffleOp(PFEntry)) {
 // Without NEON the entry and both of its direct sub-entries must pass
 // isLegalMVEShuffleOp before the table expansion is used.
8993 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8994 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8995 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8996 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8997 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8998 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8999 }
9000 }
9001 }
9002
9003 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
9004 if (EltSize >= 32) {
9005 // Do the expansion with floating-point types, since that is what the VFP
9006 // registers are defined to use, and since i64 is not legal.
9007 EVT EltVT = EVT::getFloatingPointVT(EltSize);
9008 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
9009 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
9010 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
 // One operand per result lane: undef for negative mask entries, otherwise
 // the selected lane of V1 or V2 (lane index taken modulo NumElts via the
 // NumElts-1 bit mask).
9012 for (unsigned i = 0; i < NumElts; ++i) {
9013 if (ShuffleMask[i] < 0)
9014 Ops.push_back(DAG.getUNDEF(EltVT));
9015 else
9016 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
9017 ShuffleMask[i] < (int)NumElts ? V1 : V2,
9018 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
9019 dl, MVT::i32)));
9020 }
9021 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
9022 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
9023 }
9024
9025 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
9026 isReverseMask(ShuffleMask, VT))
9027 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
9028
9029 if (ST->hasNEON() && VT == MVT::v8i8)
9030 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
9031 return NewOp;
9032
9033 if (ST->hasMVEIntegerOps())
9034 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
9035 return NewOp;
9036
9037 return SDValue();
9038}
9039
// Inserts a scalar into one lane of an MVE predicate (i1-element) vector.
// The predicate operand is cast to an i32 with ARMISD::PREDICATE_CAST, the
// bits selected for the lane are overwritten through ARMISD::BFI, and the i32
// is cast back to the result predicate type.
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded numbering starts at 9041). The assert
// message below and the call in LowerINSERT_VECTOR_ELT suggest the name is
// LowerINSERT_VECTOR_ELT_i1 taking (Op, DAG, ST) -- confirm against the file.
9041 const ARMSubtarget *ST) {
9042 EVT VecVT = Op.getOperand(0).getValueType();
9043 SDLoc dl(Op);
9044
9045 assert(ST->hasMVEIntegerOps() &&
9046 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
9047
// View the whole predicate vector as a plain i32 so it can be edited with
// integer bit operations.
9048 SDValue Conv =
9049 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
// Operand 2 is the lane index; it is read as a constant here.
9050 unsigned Lane = Op.getConstantOperandVal(2);
9051 unsigned LaneWidth =
// NOTE(review): the initializer of LaneWidth (embedded line 9052) is missing
// from this listing; LaneWidth is used below as the number of bits per lane.
// Mask covers the LaneWidth bits that belong to lane `Lane`.
9053 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
// Sign-extend from bit 0 so the inserted boolean fills every bit of the i32.
9054 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
9055 Op.getOperand(1), DAG.getValueType(MVT::i1));
// The BFI node is given the inverted mask as its third operand.
9056 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
9057 DAG.getConstant(~Mask, dl, MVT::i32));
9058 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
9059}
9060
// Custom lowering for INSERT_VECTOR_ELT.
//  * Non-constant lane index: returns an empty SDValue.
//  * MVE with a 1-bit element result type: forwards to
//    LowerINSERT_VECTOR_ELT_i1.
//  * Element type with a particular type action (see note below): the insert
//    is redone on the same-width integer vector and bitcast back.
//  * Anything else: the node is returned unchanged.
9061SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
9062 SelectionDAG &DAG) const {
9063 // INSERT_VECTOR_ELT is legal only for immediate indexes.
9064 SDValue Lane = Op.getOperand(2);
9065 if (!isa<ConstantSDNode>(Lane))
9066 return SDValue();
9067
9068 SDValue Elt = Op.getOperand(1);
9069 EVT EltVT = Elt.getValueType();
9070
9071 if (Subtarget->hasMVEIntegerOps() &&
9072 Op.getValueType().getScalarSizeInBits() == 1)
9073 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
9074
9075 if (getTypeAction(*DAG.getContext(), EltVT) ==
// NOTE(review): the right-hand side of this comparison and the opening brace
// (embedded line 9076) are missing from this listing.
9077 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
9078 // but the type system will try to do that if we don't intervene.
9079 // Reinterpret any such vector-element insertion as one with the
9080 // corresponding integer types.
9081
9082 SDLoc dl(Op);
9083
// Integer type with the same bit width as the element being inserted.
9084 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
9085 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
// NOTE(review): the remainder of this assert (embedded line 9086) is missing
// from this listing.
9087
9088 SDValue VecIn = Op.getOperand(0);
9089 EVT VecVT = VecIn.getValueType();
9090 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
9091 VecVT.getVectorNumElements());
9092
// Bitcast element and vector to integer types, insert, then bitcast back.
9093 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
9094 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
9095 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
9096 IVecIn, IElt, Lane);
9097 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
9098 }
9099
9100 return Op;
9101}
9102
// Extracts one lane from an MVE predicate (i1-element) vector: the predicate
// is cast to an i32 and shifted right by Lane * LaneWidth, and that shifted
// i32 is returned directly.
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded numbering starts at 9104). The call in
// the EXTRACT_VECTOR_ELT lowering below suggests the name is
// LowerEXTRACT_VECTOR_ELT_i1 taking (Op, DAG, ST) -- confirm.
9104 const ARMSubtarget *ST) {
9105 EVT VecVT = Op.getOperand(0).getValueType();
9106 SDLoc dl(Op);
9107
// NOTE(review): the message names LowerINSERT_VECTOR_ELT_i1 although this
// block reads operand 1 as the lane and performs an extract; it looks
// copy-pasted from the insert variant.
9108 assert(ST->hasMVEIntegerOps() &&
9109 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
9110
9111 SDValue Conv =
9112 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
// Operand 1 is the lane index; it is read as a constant here.
9113 unsigned Lane = Op.getConstantOperandVal(1);
9114 unsigned LaneWidth =
// NOTE(review): the initializer of LaneWidth (embedded line 9115) is missing
// from this listing.
// Move the first bit of the requested lane down to bit 0.
9116 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
9117 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
9118 return Shift;
9119}
9120
// Custom lowering for EXTRACT_VECTOR_ELT.
//  * Non-constant lane index: returns an empty SDValue.
//  * MVE with 1-bit source elements: forwards to LowerEXTRACT_VECTOR_ELT_i1.
//  * i32 result from elements narrower than 32 bits: emits
//    ARMISD::VGETLANEu.
//  * Anything else: the node is returned unchanged.
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded numbering jumps from 9120 to 9122).
9122 const ARMSubtarget *ST) {
9123 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
9124 SDValue Lane = Op.getOperand(1);
9125 if (!isa<ConstantSDNode>(Lane))
9126 return SDValue();
9127
9128 SDValue Vec = Op.getOperand(0);
9129 EVT VT = Vec.getValueType();
9130
9131 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9132 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
9133
9134 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
9135 SDLoc dl(Op);
9136 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
9137 }
9138
9139 return Op;
9140}
9141
// Lowers CONCAT_VECTORS whose result has 1-bit elements (MVE predicates).
// Operands are concatenated pairwise: each pair is promoted to integer
// vectors with PromoteMVEPredVector, combined into one integer vector, and
// compared against zero with ARMISD::VCMPZ to produce a predicate again. The
// pairing is repeated until a single value remains.
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded numbering starts at 9143). The call in
// the CONCAT_VECTORS lowering below suggests the name is
// LowerCONCAT_VECTORS_i1 taking (Op, DAG, ST) -- confirm.
9143 const ARMSubtarget *ST) {
9144 SDLoc dl(Op);
9145 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
9146 "Unexpected custom CONCAT_VECTORS lowering");
// NOTE(review): the condition of a second assert (embedded line 9147) is
// missing from this listing; only its message on the next line survived.
9148 "Unexpected custom CONCAT_VECTORS lowering");
9149 assert(ST->hasMVEIntegerOps() &&
9150 "CONCAT_VECTORS lowering only supported for MVE");
9151
// Concatenates two predicate vectors of identical type into one predicate
// with twice as many elements.
9152 auto ConcatPair = [&](SDValue V1, SDValue V2) {
9153 EVT Op1VT = V1.getValueType();
9154 EVT Op2VT = V2.getValueType();
9155 assert(Op1VT == Op2VT && "Operand types don't match!");
9156 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
9157 "Unexpected i1 concat operations!");
9158 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
9159
9160 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9161 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
9162
9163 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
9164 // promoted to v8i16, etc.
9165 MVT ElType =
// NOTE(review): the initializer of ElType (embedded line 9166) is missing
// from this listing; ElType is used below as the element type of ConcatVT.
9167 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
9168
9169 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
9170 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
9171 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
9172 // ConcatVT.
9173 SDValue ConVec =
9174 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
9175 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9176 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9177 }
9178
// Only v2i1 operands reach this point (v4i1 and v8i1 returned above).
9179 // Extract the vector elements from Op1 and Op2 one by one and truncate them
9180 // to be the right size for the destination. For example, if Op1 is v4i1
9181 // then the promoted vector is v4i32. The result of concatenation gives a
9182 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
9183 // needs truncating to i16 and inserting in the result.
// ExtractInto copies every element of NewV into ConVec starting at index j
// and advances j (taken by reference) past the inserted elements.
9184 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
9185 EVT NewVT = NewV.getValueType();
9186 EVT ConcatVT = ConVec.getValueType();
9187 unsigned ExtScale = 1;
// A v2f64 source is viewed as v4i32, so only every second i32 lane is read.
9188 if (NewVT == MVT::v2f64) {
9189 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
9190 ExtScale = 2;
9191 }
9192 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
9193 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
9194 DAG.getIntPtrConstant(i * ExtScale, dl));
9195 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
9196 DAG.getConstant(j, dl, MVT::i32));
9197 }
9198 return ConVec;
9199 };
9200 unsigned j = 0;
9201 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
9202 ConVec = ExtractInto(NewV1, ConVec, j);
9203 ConVec = ExtractInto(NewV2, ConVec, j);
9204
9205 // Now return the result of comparing the subvector with zero, which will
9206 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9207 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9208 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9209 };
9210
9211 // Concat each pair of subvectors and pack into the lower half of the array.
// The loop reads operands I and I + 1 together, so it relies on an even
// operand count at every round.
9212 SmallVector<SDValue> ConcatOps(Op->ops());
9213 while (ConcatOps.size() > 1) {
9214 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
9215 SDValue V1 = ConcatOps[I];
9216 SDValue V2 = ConcatOps[I + 1];
9217 ConcatOps[I / 2] = ConcatPair(V1, V2);
9218 }
9219 ConcatOps.resize(ConcatOps.size() / 2);
9220 }
9221 return ConcatOps[0];
9222}
9223
// Custom lowering for CONCAT_VECTORS. MVE predicate results are forwarded to
// LowerCONCAT_VECTORS_i1. Otherwise the two operands are bitcast to f64 and
// inserted as lanes 0 and 1 of a v2f64, which is bitcast to the result type;
// undef operands are skipped so their lane stays undef.
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded numbering jumps from 9223 to 9225);
// callers in this listing invoke LowerCONCAT_VECTORS(N0, DAG, ST).
9225 const ARMSubtarget *ST) {
9226 EVT VT = Op->getValueType(0);
9227 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9228 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
9229
9230 // The only time a CONCAT_VECTORS operation can have legal types is when
9231 // two 64-bit vectors are concatenated to a 128-bit vector.
9232 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
9233 "unexpected CONCAT_VECTORS");
9234 SDLoc dl(Op);
9235 SDValue Val = DAG.getUNDEF(MVT::v2f64);
9236 SDValue Op0 = Op.getOperand(0);
9237 SDValue Op1 = Op.getOperand(1);
9238 if (!Op0.isUndef())
9239 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9240 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
9241 DAG.getIntPtrConstant(0, dl));
9242 if (!Op1.isUndef())
9243 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9244 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
9245 DAG.getIntPtrConstant(1, dl));
9246 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
9247}
9248
// Lowers EXTRACT_SUBVECTOR for MVE predicate (1-bit element) vectors. The
// source predicate is promoted to an integer vector, the requested elements
// are copied one by one into a fresh integer vector, and that vector is
// compared against zero with ARMISD::VCMPZ to form the result predicate.
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded numbering jumps from 9248 to 9250).
9250 const ARMSubtarget *ST) {
9251 SDValue V1 = Op.getOperand(0);
9252 SDValue V2 = Op.getOperand(1);
9253 SDLoc dl(Op);
9254 EVT VT = Op.getValueType();
9255 EVT Op1VT = V1.getValueType();
9256 unsigned NumElts = VT.getVectorNumElements();
// Operand 1 is the index of the first source element to extract.
9257 unsigned Index = V2->getAsZExtVal();
9258
9259 assert(VT.getScalarSizeInBits() == 1 &&
9260 "Unexpected custom EXTRACT_SUBVECTOR lowering");
9261 assert(ST->hasMVEIntegerOps() &&
9262 "EXTRACT_SUBVECTOR lowering only supported for MVE");
9263
9264 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9265
9266 // We now have Op1 promoted to a vector of integers, where v8i1 gets
9267 // promoted to v8i16, etc.
9268
// NOTE(review): a line is missing here (embedded line 9269); ElType, used
// further down, is not declared anywhere in this listing and was presumably
// declared on that line.
9270
// Two-element result: each extracted element is written to two adjacent
// lanes of a v4i32, compared as v4i1, and the result is cast to v2i1.
9271 if (NumElts == 2) {
9272 EVT SubVT = MVT::v4i32;
9273 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9274 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
9275 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9276 DAG.getIntPtrConstant(i, dl));
9277 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9278 DAG.getConstant(j, dl, MVT::i32));
9279 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9280 DAG.getConstant(j + 1, dl, MVT::i32));
9281 }
9282 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
9283 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9284 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
9285 }
9286
// General case: copy elements Index .. Index + NumElts - 1 into lanes
// 0 .. NumElts - 1 of the new vector.
9287 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
9288 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9289 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
9290 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9291 DAG.getIntPtrConstant(i, dl));
9292 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9293 DAG.getConstant(j, dl, MVT::i32));
9294 }
9295
9296 // Now return the result of comparing the subvector with zero,
9297 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9298 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
9299 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9300}
9301
9302// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
// Only bit 0 of each source element decides the resulting predicate bit: the
// source is ANDed with 1 and compared not-equal to 0. The result type must be
// v16i1, v8i1 or v4i1 and MVE integer ops are required (both asserted).
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded line 9303). The call in the truncate
// lowering below suggests the name is LowerTruncatei1 taking (N, DAG, ST).
9304 const ARMSubtarget *ST) {
9305 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9306 EVT VT = N->getValueType(0);
9307 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9308 "Expected a vector i1 type!");
9309 SDValue Op = N->getOperand(0);
9310 EVT FromVT = Op.getValueType();
9311 SDLoc DL(N);
9312
9313 SDValue And =
9314 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9315 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9316 DAG.getCondCode(ISD::SETNE));
9317}
9318
// Custom lowering for vector truncates on MVE.
//  * No MVE integer ops: returns an empty SDValue.
//  * i1 result elements: forwards to LowerTruncatei1.
//  * v8i32 -> v8i16, v16i16 -> v16i8 and the other combinations of those
//    source and result types accepted by the checks below: the operand is
//    split in two halves which become the operands of ARMISD::MVETRUNC.
//  * Any other type combination: returns an empty SDValue.
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded numbering jumps from 9318 to 9320); a
// later comment in this listing refers to this routine as LowerTruncate.
9320 const ARMSubtarget *Subtarget) {
9321 if (!Subtarget->hasMVEIntegerOps())
9322 return SDValue();
9323
9324 EVT ToVT = N->getValueType(0);
9325 if (ToVT.getScalarType() == MVT::i1)
9326 return LowerTruncatei1(N, DAG, Subtarget);
9327
9328 // MVE does not have a single instruction to perform the truncation of a v4i32
9329 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9330 // Most of the instructions in MVE follow the 'Beats' system, where moving
9331 // values from different lanes is usually something that the instructions
9332 // avoid.
9333 //
9334 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9335 // which take the top/bottom half of a larger lane and extend it (or do the
9336 // opposite, truncating into the top/bottom lane from a larger lane). Note
9337 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9338 // bottom 16bits from each vector lane. This works really well with T/B
9339 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9340 // to move order.
9341 //
9342 // But truncates and sext/zext are always going to be fairly common from llvm.
9343 // We have several options for how to deal with them:
9344 // - Wherever possible combine them into an instruction that makes them
9345 // "free". This includes loads/stores, which can perform the trunc as part
9346 // of the memory operation. Or certain shuffles that can be turned into
9347 // VMOVN/VMOVL.
9348 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9349 // trunc(mul(sext(a), sext(b))) may become
9350 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9351 // this case can use VMULL). This is performed in the
9352 // MVELaneInterleavingPass.
9353 // - Otherwise we have an option. By default we would expand the
9354 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9355 // registers. One for each vector lane in the vector. This can obviously be
9356 // very expensive.
9357 // - The other option is to use the fact that loads/store can extend/truncate
9358 // to turn a trunc into two truncating stack stores and a stack reload. This
9359 // becomes 3 back-to-back memory operations, but at least that is less than
9360 // all the insert/extracts.
9361 //
9362 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9363 // are either optimized where they can be, or eventually lowered into stack
9364 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9365 // too early, where other instructions would be better, and stops us from
9366 // having to reconstruct multiple buildvector shuffles into loads/stores.
9367 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9368 return SDValue();
9369 EVT FromVT = N->getOperand(0).getValueType();
9370 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9371 return SDValue();
9372
// Split the wide source into low and high halves for MVETRUNC.
9373 SDValue Lo, Hi;
9374 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9375 SDLoc DL(N);
9376 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9377}
9378
// Custom lowering for vector extends on MVE. For a v8i16 or v16i8 source
// extended to v16i32, v8i32 or v16i16, a two-result node (opcode chosen on
// the missing line noted below) yields the two halves, which are joined with
// CONCAT_VECTORS. When extending i8 elements to i32, the halves are first
// produced as v8i16 and then extended again to v8i32 with the original
// opcode. Returns an empty SDValue without MVE or for other type pairs.
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded numbering jumps from 9378 to 9380).
9380 const ARMSubtarget *Subtarget) {
9381 if (!Subtarget->hasMVEIntegerOps())
9382 return SDValue();
9383
9384 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9385
9386 EVT ToVT = N->getValueType(0);
9387 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9388 return SDValue();
9389 SDValue Op = N->getOperand(0);
9390 EVT FromVT = Op.getValueType();
9391 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9392 return SDValue();
9393
9394 SDLoc DL(N);
// Each result half has half as many elements as the full result type.
9395 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
// i8 -> i32 is done in two steps, so the first step produces v8i16 halves.
9396 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9397 ExtVT = MVT::v8i16;
9398
9399 unsigned Opcode =
// NOTE(review): the initializer of Opcode (embedded line 9400) is missing
// from this listing.
9401 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9402 SDValue Ext1 = Ext.getValue(1);
9403
9404 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9405 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9406 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9407 }
9408
9409 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9410}
9411
9412/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9413/// element has been zero/sign-extended, depending on the isSigned parameter,
9414/// from an integer type half its size.
// Two shapes are accepted: a v2i64 BITCAST of a v4i32 BUILD_VECTOR, checked
// through its low/high 32-bit halves, and a plain BUILD_VECTOR whose operands
// are checked against the range of the half-width type.
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded line 9415); callers in this listing
// invoke isExtendedBUILD_VECTOR(N, DAG, <bool>).
9416 bool isSigned) {
9417 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9418 EVT VT = N->getValueType(0);
9419 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9420 SDNode *BVN = N->getOperand(0).getNode();
9421 if (BVN->getValueType(0) != MVT::v4i32 ||
9422 BVN->getOpcode() != ISD::BUILD_VECTOR)
9423 return false;
// Index, within each pair of i32 elements, of the low and high half.
9424 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9425 unsigned HiElt = 1 - LoElt;
// NOTE(review): the declarations of Lo0, Hi0, Lo1 and Hi1 (embedded lines
// 9426-9429) are missing from this listing.
9430 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9431 return false;
9432 if (isSigned) {
9433 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9434 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9435 return true;
9436 } else {
// Zero-extended: both high halves must be zero.
9437 if (Hi0->isZero() && Hi1->isZero())
9438 return true;
9439 }
9440 return false;
9441 }
9442
9443 if (N->getOpcode() != ISD::BUILD_VECTOR)
9444 return false;
9445
9446 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9447 SDNode *Elt = N->getOperand(i).getNode();
// NOTE(review): the statement that opens the scope closed at embedded line
// 9459 and introduces C (embedded line 9448) is missing from this listing.
9449 unsigned EltSize = VT.getScalarSizeInBits();
9450 unsigned HalfSize = EltSize / 2;
// The value must be representable in HalfSize bits (signed or unsigned).
9451 if (isSigned) {
9452 if (!isIntN(HalfSize, C->getSExtValue()))
9453 return false;
9454 } else {
9455 if (!isUIntN(HalfSize, C->getZExtValue()))
9456 return false;
9457 }
9458 continue;
9459 }
9460 return false;
9461 }
9462
9463 return true;
9464}
9465
9466/// isSignExtended - Check if a node is a vector value that is sign-extended
9467/// or a constant BUILD_VECTOR with sign-extended elements.
// Returns true for a SIGN_EXTEND node, a sign-extending load, or a node
// accepted by isExtendedBUILD_VECTOR with isSigned == true.
// NOTE(review): the signature line (embedded line 9468) is absent from this
// listing; callers in this listing invoke isSignExtended(N, DAG).
9469 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9470 return true;
9471 if (isExtendedBUILD_VECTOR(N, DAG, true))
9472 return true;
9473 return false;
9474}
9475
9476/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9477/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
// NOTE(review): the signature line (embedded line 9478) is absent from this
// listing; callers in this listing invoke isZeroExtended(N, DAG).
9479 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
// NOTE(review): the last operand of this condition (embedded line 9480) is
// missing from this listing.
9481 return true;
9482 if (isExtendedBUILD_VECTOR(N, DAG, false))
9483 return true;
9484 return false;
9485}
9486
9487static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9488 if (OrigVT.getSizeInBits() >= 64)
9489 return OrigVT;
9490
9491 assert(OrigVT.isSimple() && "Expecting a simple value type");
9492
9493 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9494 switch (OrigSimpleTy) {
9495 default: llvm_unreachable("Unexpected Vector Type");
9496 case MVT::v2i8:
9497 case MVT::v2i16:
9498 return MVT::v2i32;
9499 case MVT::v4i8:
9500 return MVT::v4i16;
9501 }
9502}
9503
9504/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9505/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9506/// We insert the required extension here to get the vector to fill a D register.
// N is returned unchanged when OrigTy is already at least 64 bits wide;
// otherwise N is wrapped in an ExtOpcode node whose type comes from
// getExtensionTo64Bits(OrigTy). ExtTy is only checked by the assert.
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded line 9507); the caller in this listing
// passes (N, DAG, OrigTy, ExtTy, ExtOpcode).
9508 const EVT &OrigTy,
9509 const EVT &ExtTy,
9510 unsigned ExtOpcode) {
9511 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9512 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9513 // 64-bits we need to insert a new extension so that it will be 64-bits.
9514 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9515 if (OrigTy.getSizeInBits() >= 64)
9516 return N;
9517
9518 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9519 EVT NewVT = getExtensionTo64Bits(OrigTy);
9520
9521 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9522}
9523
9524/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9525/// does not do any sign/zero extension. If the original vector is less
9526/// than 64 bits, an appropriate extension will be added after the load to
9527/// reach a total size of 64 bits. We have to add the extension separately
9528/// because ARM does not have a sign/zero extending load for vectors.
// The new load reuses the chain, base pointer, pointer info, alignment and
// memory-operand flags of LD.
// NOTE(review): the signature line (embedded line 9529) is absent from this
// listing; the caller in this listing passes (LD, DAG).
9530 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9531
9532 // The load already has the right type.
9533 if (ExtendedTy == LD->getMemoryVT())
9534 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9535 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9536 LD->getMemOperand()->getFlags());
9537
9538 // We need to create a zextload/sextload. We cannot just create a load
9539 // followed by a sext/zext node because LowerMUL is also run during normal
9540 // operation legalization where we can't create illegal types.
9541 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9542 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9543 LD->getMemoryVT(), LD->getAlign(),
9544 LD->getMemOperand()->getFlags());
9545}
9546
9547/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9548/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9549/// the unextended value. The unextended vector should be 64 bits so that it can
9550/// be used as an operand to a VMULL instruction. If the original vector size
9551/// before extension is less than 64 bits we add an extension to resize
9552/// the vector to 64 bits.
// NOTE(review): the signature line (embedded line 9553) is absent from this
// listing; callers in this listing pass (N, DAG) where N is an SDNode*.
9554 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9555 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9556 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9557 N->getOperand(0)->getValueType(0),
9558 N->getValueType(0),
9559 N->getOpcode());
9560
9561 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9562 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9563 "Expected extending load");
9564
// Replace the extending load by a narrower load plus an explicit extend, and
// redirect existing users of the old load's chain and value to the new nodes.
9565 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9566 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9567 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9568 SDValue extLoad =
9569 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9570 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9571
9572 return newLoad;
9573 }
9574
9575 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9576 // have been legalized as a BITCAST from v4i32.
9577 if (N->getOpcode() == ISD::BITCAST) {
9578 SDNode *BVN = N->getOperand(0).getNode();
// NOTE(review): the first line of this assert (embedded line 9579) is missing
// from this listing; only its continuation survived.
9580 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
// Keep only the low 32-bit half of each 64-bit element.
9581 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9582 return DAG.getBuildVector(
9583 MVT::v2i32, SDLoc(N),
9584 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9585 }
9586 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9587 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9588 EVT VT = N->getValueType(0);
9589 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9590 unsigned NumElts = VT.getVectorNumElements();
9591 MVT TruncVT = MVT::getIntegerVT(EltSize);
// NOTE(review): the declaration of Ops (embedded line 9592) is missing from
// this listing.
9593 SDLoc dl(N);
9594 for (unsigned i = 0; i != NumElts; ++i) {
9595 const APInt &CInt = N->getConstantOperandAPInt(i);
9596 // Element types smaller than 32 bits are not legal, so use i32 elements.
9597 // The values are implicitly truncated so sext vs. zext doesn't matter.
9598 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9599 }
9600 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9601}
9602
9603static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9604 unsigned Opcode = N->getOpcode();
9605 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9606 SDNode *N0 = N->getOperand(0).getNode();
9607 SDNode *N1 = N->getOperand(1).getNode();
9608 return N0->hasOneUse() && N1->hasOneUse() &&
9609 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9610 }
9611 return false;
9612}
9613
9614static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9615 unsigned Opcode = N->getOpcode();
9616 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9617 SDNode *N0 = N->getOperand(0).getNode();
9618 SDNode *N1 = N->getOperand(1).getNode();
9619 return N0->hasOneUse() && N1->hasOneUse() &&
9620 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9621 }
9622 return false;
9623}
9624
// Custom lowering for 128-bit integer vector multiplies. When both operands
// are sign-extended (or both zero-extended) the multiply becomes
// ARMISD::VMULLs (or ARMISD::VMULLu) on the unextended operands. When one
// operand is an add/sub of two extended values and the other is extended, the
// multiply is distributed into two VMULL nodes combined with the original
// add/sub opcode. Otherwise v2i64 returns an empty SDValue (to be expanded)
// and other types return the node unchanged.
// NOTE(review): the signature line (embedded line 9625) is absent from this
// listing; a comment elsewhere in this listing refers to LowerMUL.
9626 // Multiplications are only custom-lowered for 128-bit vectors so that
9627 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
9628 EVT VT = Op.getValueType();
9629 assert(VT.is128BitVector() && VT.isInteger() &&
9630 "unexpected type for custom-lowering ISD::MUL");
9631 SDNode *N0 = Op.getOperand(0).getNode();
9632 SDNode *N1 = Op.getOperand(1).getNode();
// NewOpc stays 0 until a VMULL form is recognised.
9633 unsigned NewOpc = 0;
9634 bool isMLA = false;
9635 bool isN0SExt = isSignExtended(N0, DAG);
9636 bool isN1SExt = isSignExtended(N1, DAG);
9637 if (isN0SExt && isN1SExt)
9638 NewOpc = ARMISD::VMULLs;
9639 else {
9640 bool isN0ZExt = isZeroExtended(N0, DAG);
9641 bool isN1ZExt = isZeroExtended(N1, DAG);
9642 if (isN0ZExt && isN1ZExt)
9643 NewOpc = ARMISD::VMULLu;
9644 else if (isN1SExt || isN1ZExt) {
9645 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9646 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9647 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9648 NewOpc = ARMISD::VMULLs;
9649 isMLA = true;
9650 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9651 NewOpc = ARMISD::VMULLu;
9652 isMLA = true;
// NOTE(review): this branch is only tested when N1 is already classified as
// sign- or zero-extended, yet isAddSubZExt(N1) requires N1 to be an ADD or
// SUB; it looks unreachable unless an extended node can also be ADD/SUB --
// confirm.
9653 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
9654 std::swap(N0, N1);
9655 NewOpc = ARMISD::VMULLu;
9656 isMLA = true;
9657 }
9658 }
9659
9660 if (!NewOpc) {
9661 if (VT == MVT::v2i64)
9662 // Fall through to expand this. It is not legal.
9663 return SDValue();
9664 else
9665 // Other vector multiplications are legal.
9666 return Op;
9667 }
9668 }
9669
9670 // Legalize to a VMULL instruction.
9671 SDLoc DL(Op);
9672 SDValue Op0;
9673 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9674 if (!isMLA) {
9675 Op0 = SkipExtensionForVMULL(N0, DAG);
// NOTE(review): the first line of this assert (embedded line 9676) is missing
// from this listing; only its continuation survived.
9677 Op1.getValueType().is64BitVector() &&
9678 "unexpected types for extended operands to VMULL");
9679 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9680 }
9681
9682 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
9683 // isel lowering to take advantage of no-stall back to back vmul + vmla.
9684 // vmull q0, d4, d6
9685 // vmlal q0, d5, d6
9686 // is faster than
9687 // vaddl q0, d4, d5
9688 // vmovl q1, d6
9689 // vmul q0, q0, q1
// N0 is the add/sub here; strip the extension from each of its operands.
9690 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9691 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9692 EVT Op1VT = Op1.getValueType();
// Both partial products are bitcast to Op1's type before the VMULL, then
// combined with the original ADD/SUB opcode.
9693 return DAG.getNode(N0->getOpcode(), DL, VT,
9694 DAG.getNode(NewOpc, DL, VT,
9695 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9696 DAG.getNode(NewOpc, DL, VT,
9697 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9698}
9699
// Signed division helper working in single-precision floating point: both
// operands are sign-extended to v4i32 and converted to v4f32, X is multiplied
// by a reciprocal estimate of Y, a bias is added to the raw bit pattern, and
// the result is converted back and truncated to v4i16.
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded line 9700). The signed-division lowering
// in this listing calls LowerSDIV_v4i8(N0, N1, dl, DAG) with v4i16 values, and
// this body uses X, Y and dl -- confirm the exact signature.
9701 SelectionDAG &DAG) {
9702 // TODO: Should this propagate fast-math-flags?
9703
9704 // Convert to float
9705 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9706 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9707 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9708 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9709 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9710 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9711 // Get reciprocal estimate.
9712 // float4 recip = vrecpeq_f32(yf);
9713 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9714 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9715 Y);
9716 // Because char has a smaller range than uchar, we can actually get away
9717 // without any newton steps. This requires that we use a weird bias
9718 // of 0xb000, however (again, this has been exhaustively tested).
9719 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
9720 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
// The bias is added to the integer bit pattern of the float product.
9721 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9722 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9723 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9724 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9725 // Convert back to short.
9726 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9727 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9728 return X;
9729}
9730
// Signed division helper working in single-precision floating point: both
// operands are sign-extended to v4i32 and converted to v4f32, N0 is
// multiplied by a reciprocal estimate of N1 refined with one VRECPS step, a
// bias is added to the raw bit pattern, and the result is converted back and
// truncated to v4i16.
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded line 9731). Callers in this listing
// invoke LowerSDIV_v4i16(N0, N1, dl, DAG) -- confirm the exact signature.
9732 SelectionDAG &DAG) {
9733 // TODO: Should this propagate fast-math-flags?
9734
9735 SDValue N2;
9736 // Convert to float.
9737 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9738 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9739 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9740 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9741 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9742 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9743
9744 // Use reciprocal estimate and one refinement step.
9745 // float4 recip = vrecpeq_f32(yf);
9746 // recip *= vrecpsq_f32(yf, recip);
9747 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9748 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9749 N1);
9750 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9751 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9752 N1, N2);
9753 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9754 // Because short has a smaller range than ushort, we can actually get away
9755 // with only a single newton step. This requires that we use a weird bias
9756 // of 0x89, however (again, this has been exhaustively tested).
9757 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9758 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
// The bias is added to the integer bit pattern of the float product.
9759 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9760 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9761 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9762 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9763 // Convert back to integer and return.
9764 // return vmovn_s32(vcvt_s32_f32(result));
9765 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9766 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9767 return N0;
9768}
9769
// Custom lowering for signed division of v4i16 and v8i8 vectors (asserted).
// v8i8 operands are sign-extended to v8i16, split into two v4i16 halves that
// are each divided by LowerSDIV_v4i8, concatenated again and truncated to
// v8i8. v4i16 operands go straight to LowerSDIV_v4i16.
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded line 9770); the body uses Op, DAG, ST.
9771 const ARMSubtarget *ST) {
9772 EVT VT = Op.getValueType();
9773 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9774 "unexpected type for custom-lowering ISD::SDIV");
9775
9776 SDLoc dl(Op);
9777 SDValue N0 = Op.getOperand(0);
9778 SDValue N1 = Op.getOperand(1);
9779 SDValue N2, N3;
9780
9781 if (VT == MVT::v8i8) {
9782 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9783 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9784
// N2/N3 take the upper four lanes, N0/N1 are narrowed to the lower four.
9785 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9786 DAG.getIntPtrConstant(4, dl));
9787 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9788 DAG.getIntPtrConstant(4, dl));
9789 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9790 DAG.getIntPtrConstant(0, dl));
9791 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9792 DAG.getIntPtrConstant(0, dl));
9793
9794 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9795 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9796
// The freshly built CONCAT_VECTORS is lowered immediately.
9797 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9798 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9799
9800 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9801 return N0;
9802 }
9803 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9804}
9805
// Custom lowering for unsigned division of v4i16 and v8i8 vectors (asserted).
// v8i8 operands are zero-extended to v8i16, split into two v4i16 halves that
// are each divided by LowerSDIV_v4i16, concatenated again and narrowed to
// v8i8 with the arm_neon_vqmovnsu intrinsic. v4i16 operands are divided in
// single-precision floating point using a reciprocal estimate refined with
// two VRECPS steps.
// NOTE(review): the line holding the function name and leading parameters is
// absent from this listing (embedded line 9806); the body uses Op, DAG, ST.
9807 const ARMSubtarget *ST) {
9808 // TODO: Should this propagate fast-math-flags?
9809 EVT VT = Op.getValueType();
9810 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9811 "unexpected type for custom-lowering ISD::UDIV");
9812
9813 SDLoc dl(Op);
9814 SDValue N0 = Op.getOperand(0);
9815 SDValue N1 = Op.getOperand(1);
9816 SDValue N2, N3;
9817
9818 if (VT == MVT::v8i8) {
9819 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9820 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9821
// N2/N3 take the upper four lanes, N0/N1 are narrowed to the lower four.
9822 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9823 DAG.getIntPtrConstant(4, dl));
9824 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9825 DAG.getIntPtrConstant(4, dl));
9826 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9827 DAG.getIntPtrConstant(0, dl));
9828 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9829 DAG.getIntPtrConstant(0, dl));
9830
// The signed v4i16 helper is used on the zero-extended 8-bit values.
9831 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9832 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9833
9834 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9835 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9836
9837 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9838 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9839 MVT::i32),
9840 N0);
9841 return N0;
9842 }
9843
9844 // v4i16 udiv ... Convert to float.
9845 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9846 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9847 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9848 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9849 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
// BN1 keeps the float divisor; N1 is reused below for intermediate values.
9850 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9851
9852 // Use reciprocal estimate and two refinement steps.
9853 // float4 recip = vrecpeq_f32(yf);
9854 // recip *= vrecpsq_f32(yf, recip);
9855 // recip *= vrecpsq_f32(yf, recip);
9856 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9857 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9858 BN1);
9859 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9860 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9861 BN1, N2);
9862 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9863 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9864 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9865 BN1, N2);
9866 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9867 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9868 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9869 // and that it will never cause us to return an answer too large).
9870 // float4 result = as_float4(as_int4(xf*recip) + 2);
9871 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9872 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9873 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9874 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9875 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9876 // Convert back to integer and return.
9877 // return vmovn_u32(vcvt_s32_f32(result));
9878 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9879 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9880 return N0;
9881}
9882
// Lowers ISD::UADDO_CARRY (and, in the else branch, the subtract-with-borrow
// form) to ARMISD::ADDE / ARMISD::SUBE. Operand 2 is the incoming boolean
// carry; the result is a MERGE_VALUES of the arithmetic result and the
// outgoing boolean carry/borrow.
// NOTE(review): the signature line (original line 9883) is absent from this
// listing; LowerOperation dispatches UADDO_CARRY/USUBO_CARRY to
// LowerUADDSUBO_CARRY(Op, DAG), presumably this function — confirm.
9884 SDNode *N = Op.getNode();
9885 EVT VT = N->getValueType(0);
// Result types of the ARMISD node: the value plus an i32 carrying the flag.
9886 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9887
9888 SDValue Carry = Op.getOperand(2);
9889
9890 SDLoc DL(Op);
9891
9892 SDValue Result;
9893 if (Op.getOpcode() == ISD::UADDO_CARRY) {
9894 // This converts the boolean value carry into the carry flag.
9895 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9896
9897 // Do the addition proper using the carry flag we wanted.
9898 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
9899 Op.getOperand(1), Carry);
9900
9901 // Now convert the carry flag into a boolean value.
9902 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9903 } else {
9904 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
9905 // have to invert the carry first.
9906 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9907 DAG.getConstant(1, DL, MVT::i32), Carry);
9908 // This converts the boolean value carry into the carry flag.
9909 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9910
9911 // Do the subtraction proper using the carry flag we wanted.
9912 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
9913 Op.getOperand(1), Carry);
9914
9915 // Now convert the carry flag into a boolean value.
9916 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9917 // But the carry returned by ARMISD::SUBE is not a borrow as expected
9918 // by ISD::USUBO_CARRY, so compute 1 - C.
9919 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9920 DAG.getConstant(1, DL, MVT::i32), Carry);
9921 }
9922
9923 // Return both values.
9924 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
9925}
9926
// Lowers ISD::FSINCOS on Darwin targets to a call to the SINCOS_STRET libcall
// (f32 or f64 variant chosen from the argument type). Under the APCS ABI the
// sin/cos pair is returned through an sret stack slot and loaded back;
// otherwise the call's return value is used directly.
// NOTE(review): original lines 9938 and 9978 are absent from this listing;
// they presumably declare `MFI` and `CLI`, which are used below — confirm.
9927SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
9928 assert(Subtarget->isTargetDarwin());
9929
9930 // For iOS, we want to call an alternative entry point: __sincos_stret,
9931 // return values are passed via sret.
9932 SDLoc dl(Op);
9933 SDValue Arg = Op.getOperand(0);
9934 EVT ArgVT = Arg.getValueType();
9935 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9936 auto PtrVT = getPointerTy(DAG.getDataLayout());
9937
9939 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9940
9941 // Pair of floats / doubles used to pass the result.
9942 Type *RetTy = StructType::get(ArgTy, ArgTy);
9943 auto &DL = DAG.getDataLayout();
9944
9945 ArgListTy Args;
9946 bool ShouldUseSRet = Subtarget->isAPCS_ABI();
9947 SDValue SRet;
9948 if (ShouldUseSRet) {
9949 // Create stack object for sret.
9950 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
9951 const Align StackAlign = DL.getPrefTypeAlign(RetTy);
9952 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
9953 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
9954
// The sret pointer is pushed before the real argument, and the call itself
// then returns void.
9955 ArgListEntry Entry;
9956 Entry.Node = SRet;
9957 Entry.Ty = PointerType::getUnqual(RetTy->getContext());
9958 Entry.IsSExt = false;
9959 Entry.IsZExt = false;
9960 Entry.IsSRet = true;
9961 Args.push_back(Entry);
9962 RetTy = Type::getVoidTy(*DAG.getContext());
9963 }
9964
9965 ArgListEntry Entry;
9966 Entry.Node = Arg;
9967 Entry.Ty = ArgTy;
9968 Entry.IsSExt = false;
9969 Entry.IsZExt = false;
9970 Args.push_back(Entry);
9971
9972 RTLIB::Libcall LC =
9973 (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
9974 const char *LibcallName = getLibcallName(LC);
9975 CallingConv::ID CC = getLibcallCallingConv(LC);
9976 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
9977
9979 CLI.setDebugLoc(dl)
9980 .setChain(DAG.getEntryNode())
9981 .setCallee(CC, RetTy, Callee, std::move(Args))
9982 .setDiscardResult(ShouldUseSRet);
9983 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
9984
9985 if (!ShouldUseSRet)
9986 return CallResult.first;
9987
// sret path: sin is loaded from the start of the slot, chained on the call.
9988 SDValue LoadSin =
9989 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
9990
9991 // Address of cos field.
9992 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
9993 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
// The cos load is chained after the sin load.
9994 SDValue LoadCos =
9995 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
9996
9997 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
9998 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
9999 LoadSin.getValue(0), LoadCos.getValue(0));
10000}
10001
// Emits a call to one of the Windows runtime division helpers (__rt_sdiv,
// __rt_sdiv64, __rt_udiv, __rt_udiv64), selected by \p Signed and by whether
// the value type is i32 or i64. \p Chain is used as the call's chain. Returns
// the first element of the LowerCallTo result.
// NOTE(review): original lines 10019, 10021 and 10033 are absent from this
// listing; they presumably declare `ES` and `Args` and begin the setCallee
// call that ends on the "ES, std::move(Args));" line below — confirm.
10002SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
10003 bool Signed,
10004 SDValue &Chain) const {
10005 EVT VT = Op.getValueType();
10006 assert((VT == MVT::i32 || VT == MVT::i64) &&
10007 "unexpected type for custom lowering DIV");
10008 SDLoc dl(Op);
10009
10010 const auto &DL = DAG.getDataLayout();
10011 const auto &TLI = DAG.getTargetLoweringInfo();
10012
10013 const char *Name = nullptr;
10014 if (Signed)
10015 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
10016 else
10017 Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
10018
10020
10022
// Operands are pushed in reverse order: operand 1 (the divisor) first, then
// operand 0 (the dividend).
10023 for (auto AI : {1, 0}) {
10024 ArgListEntry Arg;
10025 Arg.Node = Op.getOperand(AI);
10026 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
10027 Args.push_back(Arg);
10028 }
10029
10030 CallLoweringInfo CLI(DAG);
10031 CLI.setDebugLoc(dl)
10032 .setChain(Chain)
10034 ES, std::move(Args));
10035
10036 return LowerCallTo(CLI).first;
10037}
10038
10039// This is a code size optimisation: return the original SDIV node to
10040// DAGCombiner when we don't want to expand SDIV into a sequence of
10041// instructions, and an empty node otherwise which will cause the
10042// SDIV to be expanded in DAGCombine.
10043SDValue
10044ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
10045 SelectionDAG &DAG,
10046 SmallVectorImpl<SDNode *> &Created) const {
10047 // TODO: Support SREM
10048 if (N->getOpcode() != ISD::SDIV)
10049 return SDValue();
10050
10051 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
10052 const bool MinSize = ST.hasMinSize();
10053 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
10054 : ST.hasDivideInARMMode();
10055
10056 // Don't touch vector types; rewriting this may lead to scalarizing
10057 // the int divs.
10058 if (N->getOperand(0).getValueType().isVector())
10059 return SDValue();
10060
10061 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
10062 // hwdiv support for this to be really profitable.
10063 if (!(MinSize && HasDivide))
10064 return SDValue();
10065
10066 // ARM mode is a bit simpler than Thumb: we can handle large power
10067 // of 2 immediates with 1 mov instruction; no further checks required,
10068 // just return the sdiv node.
10069 if (!ST.isThumb())
10070 return SDValue(N, 0);
10071
10072 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
10073 // and thus lose the code size benefits of a MOVS that requires only 2.
10074 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
10075 // but as it's doing exactly this, it's not worth the trouble to get TTI.
10076 if (Divisor.sgt(128))
10077 return SDValue();
10078
10079 return SDValue(N, 0);
10080}
10081
10082SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
10083 bool Signed) const {
10084 assert(Op.getValueType() == MVT::i32 &&
10085 "unexpected type for custom lowering DIV");
10086 SDLoc dl(Op);
10087
10088 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
10089 DAG.getEntryNode(), Op.getOperand(1));
10090
10091 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
10092}
10093
// Builds an ARMISD::WIN__DBZCHK node, chained on InChain, for operand 1 of N.
// When N's result type is i32 the operand is checked directly; otherwise the
// operand is split into two i32 halves and their bitwise OR is checked.
// NOTE(review): the signature line (original line 10094) is absent from this
// listing; ExpandDIV_Windows calls
// WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode()), presumably
// this function — confirm.
10095 SDLoc DL(N);
10096 SDValue Op = N->getOperand(1);
10097 if (N->getValueType(0) == MVT::i32)
10098 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
10099 SDValue Lo, Hi;
10100 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
// Lo | Hi is zero exactly when both halves are zero.
10101 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
10102 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
10103}
10104
// Expands an i64 division for Windows: emits the divide-by-zero check, calls
// the runtime division helper through LowerWindowsDIVLibCall, then splits the
// i64 result into two i32 halves and pushes them, recombined with BUILD_PAIR,
// onto Results.
// NOTE(review): original line 10107 (the end of the parameter list,
// presumably declaring `Results`) is absent from this listing — confirm.
10105void ARMTargetLowering::ExpandDIV_Windows(
10106 SDValue Op, SelectionDAG &DAG, bool Signed,
10108 const auto &DL = DAG.getDataLayout();
10109 const auto &TLI = DAG.getTargetLoweringInfo();
10110
10111 assert(Op.getValueType() == MVT::i64 &&
10112 "unexpected type for custom lowering DIV");
10113 SDLoc dl(Op);
10114
10115 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
10116
10117 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
10118
// Low half: truncate. High half: shift right by 32, then truncate.
10119 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
10120 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
10121 DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
10122 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
10123
10124 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
10125}
10126
// Lowers an unindexed, non-extending load of a predicate vector (v2i1, v4i1,
// v8i1 or v16i1): the value is loaded as an i32 extending load, bit-reversed
// and shifted on big-endian targets, cast to v16i1 with PREDICATE_CAST, and
// for narrower types the requested subvector is extracted. Returns the
// predicate merged with the load's chain.
// NOTE(review): the signature line (original line 10127) and original line
// 10151 (an argument of the getExtLoad call) are absent from this listing;
// LowerOperation routes ISD::LOAD to LowerPredicateLoad(Op, DAG), presumably
// this function — confirm.
10128 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
10129 EVT MemVT = LD->getMemoryVT();
10130 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10131 MemVT == MVT::v16i1) &&
10132 "Expected a predicate type!");
10133 assert(MemVT == Op.getValueType());
10134 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
10135 "Expected a non-extending load");
10136 assert(LD->isUnindexed() && "Expected a unindexed load");
10137
10138 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit
10139 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
10140 // need to make sure that 8/4/2 bits are actually loaded into the correct
10141 // place, which means loading the value and then shuffling the values into
10142 // the bottom bits of the predicate.
10143 // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
10144 // for BE).
10145 // Speaking of BE, apparently the rest of llvm will assume a reverse order to
10146 // a natural VMSR(load), so needs to be reversed.
10147
10148 SDLoc dl(Op);
10149 SDValue Load = DAG.getExtLoad(
10150 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
10152 LD->getMemOperand());
10153 SDValue Val = Load;
// Big-endian: reverse all 32 bits, then shift the MemVT-sized field back
// down to the bottom.
10154 if (DAG.getDataLayout().isBigEndian())
10155 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
10156 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
10157 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
10158 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
10159 if (MemVT != MVT::v16i1)
10160 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
10161 DAG.getConstant(0, dl, MVT::i32));
10162 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
10163}
10164
// Replaces a volatile, sufficiently aligned i64 load with an ARMISD::LDRD
// memory-intrinsic node when the subtarget has v5TE ops and is not
// Thumb1-only. The two i32 results are recombined with BUILD_PAIR (ordered by
// endianness) and appended to Results together with the chain. When the
// conditions do not hold, nothing is appended.
// NOTE(review): original lines 10167 and 10175 are absent from this listing;
// they presumably declare `LD` and begin the `Result = ...` statement whose
// arguments continue below — confirm.
10165void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
10166 SelectionDAG &DAG) const {
10168 EVT MemVT = LD->getMemoryVT();
10169 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
10170
10171 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10172 !Subtarget->isThumb1Only() && LD->isVolatile() &&
10173 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10174 SDLoc dl(N);
10176 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
10177 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
// Result 0 is the low half on little-endian and the high half on big-endian.
10178 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
10179 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
10180 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
10181 Results.append({Pair, Result.getValue(2)});
10182 }
10183}
10184
// Lowers an unindexed, non-truncating store of a predicate vector (v2i1,
// v4i1, v8i1 or v16i1): narrower predicates are first rebuilt as a v16i1
// BUILD_VECTOR padded with undef, the v16i1 is cast to i32 with
// PREDICATE_CAST, and the result is written with a truncating store.
// NOTE(review): the signature line (original line 10185) and original lines
// 10200 and 10219 (presumably the `Ops` declaration and an argument of
// getTruncStore) are absent from this listing; LowerSTORE calls
// LowerPredicateStore(Op, DAG), presumably this function — confirm.
10186 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10187 EVT MemVT = ST->getMemoryVT();
10188 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10189 MemVT == MVT::v16i1) &&
10190 "Expected a predicate type!");
10191 assert(MemVT == ST->getValue().getValueType());
10192 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
10193 assert(ST->isUnindexed() && "Expected a unindexed store");
10194
10195 // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
10196 // top bits unset and a scalar store.
10197 SDLoc dl(Op);
10198 SDValue Build = ST->getValue();
10199 if (MemVT != MVT::v16i1) {
10201 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
// On big-endian the source elements are taken in reverse order.
10202 unsigned Elt = DAG.getDataLayout().isBigEndian()
10203 ? MemVT.getVectorNumElements() - I - 1
10204 : I;
10205 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
10206 DAG.getConstant(Elt, dl, MVT::i32)));
10207 }
// Pad the remaining lanes up to 16 with undef.
10208 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
10209 Ops.push_back(DAG.getUNDEF(MVT::i32));
10210 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
10211 }
10212 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
// A big-endian v16i1 was not reordered element-wise above, so its bits are
// reversed here and the field shifted down by 16.
10213 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
10214 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
10215 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
10216 DAG.getConstant(16, dl, MVT::i32));
10217 return DAG.getTruncStore(
10218 ST->getChain(), dl, GRP, ST->getBasePtr(),
10220 ST->getMemOperand());
10221}
10222
// Custom store lowering with two cases:
//  * a volatile, sufficiently aligned i64 store on a v5TE, non-Thumb1
//    subtarget becomes an ARMISD::STRD memory-intrinsic node;
//  * an MVE predicate-vector store is forwarded to LowerPredicateStore.
// Any other store returns an empty SDValue.
// NOTE(review): the first signature line (original line 10223) is absent
// from this listing; LowerOperation calls LowerSTORE(Op, DAG, Subtarget),
// presumably this function — confirm.
10224 const ARMSubtarget *Subtarget) {
10225 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10226 EVT MemVT = ST->getMemoryVT();
10227 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
10228
10229 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10230 !Subtarget->isThumb1Only() && ST->isVolatile() &&
10231 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10232 SDNode *N = Op.getNode();
10233 SDLoc dl(N);
10234
// Element 0 of the stored value is the low half on little-endian and the
// high half on big-endian.
10235 SDValue Lo = DAG.getNode(
10236 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10237 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
10238 MVT::i32));
10239 SDValue Hi = DAG.getNode(
10240 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10241 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
10242 MVT::i32));
10243
10244 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
10245 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
10246 MemVT, ST->getMemOperand());
10247 } else if (Subtarget->hasMVEIntegerOps() &&
10248 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10249 MemVT == MVT::v16i1))) {
10250 return LowerPredicateStore(Op, DAG);
10251 }
10252
10253 return SDValue();
10254}
10255
10256static bool isZeroVector(SDValue N) {
10257 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
10258 (N->getOpcode() == ARMISD::VMOVIMM &&
10259 isNullConstant(N->getOperand(0))));
10260}
10261
// Lowers a masked load so that its pass-through operand is a zero vector. A
// load whose pass-through is already zero is returned unchanged. Otherwise a
// new masked load with a zero pass-through is created, and unless the
// original pass-through is undef or a cast of a zero vector, a VSELECT on the
// mask picks between the loaded value and the original pass-through.
// NOTE(review): original lines 10262-10263 (presumably the signature and the
// declaration of `N`) are absent from this listing; LowerOperation calls
// LowerMLOAD(Op, DAG) for ISD::MLOAD, presumably this function — confirm.
10264 MVT VT = Op.getSimpleValueType();
10265 SDValue Mask = N->getMask();
10266 SDValue PassThru = N->getPassThru();
10267 SDLoc dl(Op);
10268
10269 if (isZeroVector(PassThru))
10270 return Op;
10271
10272 // MVE Masked loads use zero as the passthru value. Here we convert undef to
10273 // zero too, and other values are lowered to a select.
10274 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
10275 DAG.getTargetConstant(0, dl, MVT::i32));
10276 SDValue NewLoad = DAG.getMaskedLoad(
10277 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
10278 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
10279 N->getExtensionType(), N->isExpandingLoad());
10280 SDValue Combo = NewLoad;
// A zero vector hidden behind a BITCAST or VECTOR_REG_CAST needs no select.
10281 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
10282 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
10283 isZeroVector(PassThru->getOperand(0));
10284 if (!PassThru.isUndef() && !PassThruIsCastZero)
10285 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
10286 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
10287}
10288
// Lowers the VECREDUCE opcodes listed in the switch below (FADD, FMUL, MUL,
// AND, OR, XOR, FMAX, FMIN) when MVE integer ops are available; returns an
// empty SDValue otherwise. The vector is folded with VREV + the base opcode
// until four active lanes remain, then the remaining lanes are extracted and
// combined as scalars. The scalar result is any-extended if the node's result
// type differs from the element type.
// NOTE(review): the first signature line (original line 10289) is absent
// from this listing; LowerOperation calls LowerVecReduce(Op, DAG, Subtarget),
// presumably this function — confirm.
10290 const ARMSubtarget *ST) {
10291 if (!ST->hasMVEIntegerOps())
10292 return SDValue();
10293
10294 SDLoc dl(Op);
10295 unsigned BaseOpcode = 0;
10296 switch (Op->getOpcode()) {
10297 default: llvm_unreachable("Expected VECREDUCE opcode");
10298 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
10299 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
10300 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
10301 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
10302 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
10303 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
10304 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
10305 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
10306 }
10307
10308 SDValue Op0 = Op->getOperand(0);
10309 EVT VT = Op0.getValueType();
10310 EVT EltVT = VT.getVectorElementType();
10311 unsigned NumElts = VT.getVectorNumElements();
10312 unsigned NumActiveLanes = NumElts;
10313
10314 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10315 NumActiveLanes == 2) &&
10316 "Only expected a power 2 vector size");
10317
10318 // Use BaseOpcode(X, Rev(X)) until 4 items remain. Going down to 4 vector
10319 // elements allows us to easily extract vector elements from the lanes.
10320 while (NumActiveLanes > 4) {
10321 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
10322 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
10323 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
10324 NumActiveLanes /= 2;
10325 }
10326
10327 SDValue Res;
10328 if (NumActiveLanes == 4) {
10329 // The remaining 4 elements are combined pairwise with BaseOpcode
// Lanes are read at a stride of NumElts / 4 (indices 0, NumElts/4,
// 2*NumElts/4 and 3*NumElts/4).
10330 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10331 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10332 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10333 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10334 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10335 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10336 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10337 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10338 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10339 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
10340 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
10341 } else {
// Two-lane case: combine elements 0 and 1 directly.
10342 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10343 DAG.getConstant(0, dl, MVT::i32));
10344 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10345 DAG.getConstant(1, dl, MVT::i32));
10346 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10347 }
10348
10349 // Result type may be wider than element type.
10350 if (EltVT != Op->getValueType(0))
10351 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10352 return Res;
10353}
10354
// Guarded entry point for floating-point reductions: returns an empty SDValue
// unless the subtarget has MVE float ops, otherwise forwards to
// LowerVecReduce.
// NOTE(review): the first signature line (original line 10355) is absent
// from this listing; LowerOperation calls LowerVecReduceF(Op, DAG, Subtarget),
// presumably this function — confirm.
10356 const ARMSubtarget *ST) {
10357 if (!ST->hasMVEFloatOps())
10358 return SDValue();
10359 return LowerVecReduce(Op, DAG, ST);
10360}
10361
// Lowers VECREDUCE_{UMIN,UMAX,SMIN,SMAX} on NEON using the matching pairwise
// min/max intrinsic; returns an empty SDValue when NEON is unavailable. A
// 128-bit input is first split and its halves combined with one pairwise op;
// the pairwise op is then applied to the vector with itself until one active
// lane remains. Lane 0 is extracted and, if the node's result type differs
// from the element type, zero-extended (unsigned ops) or sign-extended
// (signed ops).
// NOTE(review): the first signature line (original line 10362) is absent
// from this listing; LowerOperation calls
// LowerVecReduceMinMax(Op, DAG, Subtarget), presumably this function —
// confirm.
10363 const ARMSubtarget *ST) {
10364 if (!ST->hasNEON())
10365 return SDValue();
10366
10367 SDLoc dl(Op);
10368 SDValue Op0 = Op->getOperand(0);
10369 EVT VT = Op0.getValueType();
10370 EVT EltVT = VT.getVectorElementType();
10371
10372 unsigned PairwiseIntrinsic = 0;
10373 switch (Op->getOpcode()) {
10374 default:
10375 llvm_unreachable("Expected VECREDUCE opcode");
10376 case ISD::VECREDUCE_UMIN:
10377 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10378 break;
10379 case ISD::VECREDUCE_UMAX:
10380 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10381 break;
10382 case ISD::VECREDUCE_SMIN:
10383 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10384 break;
10385 case ISD::VECREDUCE_SMAX:
10386 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10387 break;
10388 }
10389 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10390
10391 unsigned NumElts = VT.getVectorNumElements();
10392 unsigned NumActiveLanes = NumElts;
10393
10394 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10395 NumActiveLanes == 2) &&
10396 "Only expected a power 2 vector size");
10397
10398 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10399 if (VT.is128BitVector()) {
10400 SDValue Lo, Hi;
10401 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
// From here on VT is the half-width vector type.
10402 VT = Lo.getValueType();
10403 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10404 NumActiveLanes /= 2;
10405 }
10406
10407 // Use pairwise reductions until one lane remains
10408 while (NumActiveLanes > 1) {
10409 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10410 NumActiveLanes /= 2;
10411 }
10412
10413 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10414 DAG.getConstant(0, dl, MVT::i32));
10415
10416 // Result type may be wider than element type.
10417 if (EltVT != Op.getValueType()) {
10418 unsigned Extend = 0;
10419 switch (Op->getOpcode()) {
10420 default:
10421 llvm_unreachable("Expected VECREDUCE opcode");
10422 case ISD::VECREDUCE_UMIN:
10423 case ISD::VECREDUCE_UMAX:
10424 Extend = ISD::ZERO_EXTEND;
10425 break;
10426 case ISD::VECREDUCE_SMIN:
10427 case ISD::VECREDUCE_SMAX:
10428 Extend = ISD::SIGN_EXTEND;
10429 break;
10430 }
10431 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10432 }
10433 return Res;
10434}
10435
// Decides whether an atomic load/store node is kept: a node whose success
// ordering is stronger than monotonic yields an empty SDValue, any other node
// is returned unchanged.
// NOTE(review): the signature line (original line 10436) is absent from this
// listing; LowerOperation calls LowerAtomicLoadStore(Op, DAG) for
// ATOMIC_LOAD/ATOMIC_STORE, presumably this function — confirm.
10437 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10438 // Acquire/Release load/store is not legal for targets without a dmb or
10439 // equivalent available.
10440 return SDValue();
10441
10442 // Monotonic load/store is legal for all targets.
10443 return Op;
10444}
10445
// Produces replacement results for a cycle-counter read: emits the arm_mrc
// intrinsic with operands (15, 0, 9, 13, 0), chained on N's operand 0, and
// pushes two values onto Results: an i64 BUILD_PAIR of the 32-bit count with
// a zero upper half, and the intrinsic's output chain.
// NOTE(review): original lines 10446-10447 (the function name and leading
// parameters, presumably including `N` and `Results`) are absent from this
// listing, so the function's name cannot be confirmed from here.
10448 SelectionDAG &DAG,
10449 const ARMSubtarget *Subtarget) {
10450 SDLoc DL(N);
10451 // Under Power Management extensions, the cycle-count is:
10452 // mrc p15, #0, <Rt>, c9, c13, #0
10453 SDValue Ops[] = { N->getOperand(0), // Chain
10454 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10455 DAG.getTargetConstant(15, DL, MVT::i32),
10456 DAG.getTargetConstant(0, DL, MVT::i32),
10457 DAG.getTargetConstant(9, DL, MVT::i32),
10458 DAG.getTargetConstant(13, DL, MVT::i32),
10459 DAG.getTargetConstant(0, DL, MVT::i32)
10460 };
10461
10462 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10463 DAG.getVTList(MVT::i32, MVT::Other), Ops);
// Only 32 bits are read; the upper half of the i64 result is constant zero.
10464 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10465 DAG.getConstant(0, DL, MVT::i32)));
10466 Results.push_back(Cycles32.getValue(1));
10467}
10468
// Packs V0 and V1 into a single untyped value with a REG_SEQUENCE machine
// node in the GPRPair register class: V0 goes to sub-register gsub_0 and V1
// to gsub_1.
// NOTE(review): the first signature line (original line 10469) is absent
// from this listing; other code in this file calls
// createGPRPairNode2xi32(DAG, <value>, <value>), presumably this function —
// confirm.
10470 SDValue V1) {
10471 SDLoc dl(V0.getNode());
10472 SDValue RegClass =
10473 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10474 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10475 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
// REG_SEQUENCE operands: the register class, then (value, sub-register
// index) pairs.
10476 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10477 return SDValue(
10478 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10479}
10480
// Splits V into two i32 halves and packs them into a GPR pair through
// createGPRPairNode2xi32. On big-endian targets the halves are swapped before
// packing.
// NOTE(review): the signature line (original line 10481) is absent from this
// listing; other code in this file calls createGPRPairNodei64(DAG, <value>),
// presumably this function — confirm.
10482 SDLoc dl(V.getNode());
10483 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10484 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10485 if (isBigEndian)
10486 std::swap(VLo, VHi);
10487 return createGPRPairNode2xi32(DAG, VLo, VHi);
10488}
10489
// Replaces an i64 atomic compare-and-swap node with an ARM::CMP_SWAP_64
// machine node. The pointer (paired with an undef temp) and the expected and
// new values are passed as GPR pairs, and the original memory operand is
// attached. Two values are pushed onto Results: the i64 rebuilt from the two
// sub-registers of the node's first result, and the node's chain (result 2).
// NOTE(review): original lines 10490-10491 (the function name and leading
// parameters, presumably including `N` and `Results`) are absent from this
// listing, so the function's name cannot be confirmed from here.
10492 SelectionDAG &DAG) {
10493 assert(N->getValueType(0) == MVT::i64 &&
10494 "AtomicCmpSwap on types less than 64 should be legal");
10495 SDValue Ops[] = {
10496 createGPRPairNode2xi32(DAG, N->getOperand(1),
10497 DAG.getUNDEF(MVT::i32)), // pointer, temp
10498 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10499 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10500 N->getOperand(0), // chain in
10501 };
10502 SDNode *CmpSwap = DAG.getMachineNode(
10503 ARM::CMP_SWAP_64, SDLoc(N),
10504 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
10505
10506 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10507 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10508
10509 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10510
// Sub-register selection is flipped on big-endian.
10511 SDValue Lo =
10512 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10513 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10514 SDValue Hi =
10515 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10516 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10517 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10518 Results.push_back(SDValue(CmpSwap, 2));
10519}
10520
// Lowers a strict floating-point compare whose operands are (chain, LHS, RHS,
// condition code); the compare is signaling when the opcode is
// ISD::STRICT_FSETCCS. For an unsupported float type the result is an
// ordinary ISD::SETCC on the softened operands. Otherwise a VFP compare feeds
// one or two conditional moves selecting 1 or 0. Returns the result merged
// with the incoming chain.
// NOTE(review): original line 10533 is absent from this listing; it
// presumably begins the call whose argument list continues on the line after
// the isUnsupportedFloatingType check — confirm.
10521SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10522 SDLoc dl(Op);
10523 EVT VT = Op.getValueType();
10524 SDValue Chain = Op.getOperand(0);
10525 SDValue LHS = Op.getOperand(1);
10526 SDValue RHS = Op.getOperand(2);
10527 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10528 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10529
10530 // If we don't have instructions of this float type then soften to a libcall
10531 // and use SETCC instead.
10532 if (isUnsupportedFloatingType(LHS.getValueType())) {
10534 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling);
// An empty RHS after softening is replaced by a compare of LHS against zero
// with SETNE.
10535 if (!RHS.getNode()) {
10536 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10537 CC = ISD::SETNE;
10538 }
10539 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10540 DAG.getCondCode(CC));
10541 return DAG.getMergeValues({Result, Chain}, dl);
10542 }
10543
10544 ARMCC::CondCodes CondCode, CondCode2;
10545 FPCCToARMCC(CC, CondCode, CondCode2);
10546
10547 // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit
10548 // in CMPFP and CMPFPE, but instead it should be made explicit by these
10549 // instructions using a chain instead of glue. This would also fix the problem
10550 // here (and also in LowerSELECT_CC) where we generate two comparisons when
10551 // CondCode2 != AL.
10552 SDValue True = DAG.getConstant(1, dl, VT);
10553 SDValue False = DAG.getConstant(0, dl, VT);
10554 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10555 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
10556 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10557 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG);
// A second ARM condition (anything other than AL) needs a second compare and
// a second conditional move layered on the first result.
10558 if (CondCode2 != ARMCC::AL) {
10559 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10560 Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10561 Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG);
10562 }
10563 return DAG.getMergeValues({Result, Chain}, dl);
10564}
10565
// Lowers ISD::SPONENTRY to a frame-index node for a fixed frame object
// created with CreateFixedObject(4, 0, false).
// NOTE(review): original line 10567 is absent from this listing; it
// presumably declares `MFI`, which is used below — confirm.
10566SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10568
10569 EVT VT = getPointerTy(DAG.getDataLayout());
10570 SDLoc DL(Op);
10571 int FI = MFI.CreateFixedObject(4, 0, false);
10572 return DAG.getFrameIndex(FI, VT);
10573}
10574
// Central dispatch for custom lowering: switches on the opcode of Op and
// forwards to the matching Lower*/Expand* helper. An opcode with no case hits
// llvm_unreachable.
// NOTE(review): the signature line (original line 10575) and original lines
// 10594-10595, 10598, 10692 and 10694 are absent from this listing; any case
// labels they carried are not visible here. The name LowerOperation is
// inferred, not shown — confirm.
10576 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10577 switch (Op.getOpcode()) {
10578 default: llvm_unreachable("Don't know how to custom lower this!");
10579 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10580 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10581 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10582 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10583 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10584 case ISD::SELECT: return LowerSELECT(Op, DAG);
10585 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10586 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10587 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10588 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10589 case ISD::VASTART: return LowerVASTART(Op, DAG);
10590 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10591 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10592 case ISD::SINT_TO_FP:
10593 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10596 case ISD::FP_TO_SINT:
10597 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10599 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10600 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10601 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10602 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10603 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10604 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10605 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10606 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10607 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10608 Subtarget);
10609 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10610 case ISD::SHL:
10611 case ISD::SRL:
10612 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10613 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10614 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10615 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10616 case ISD::SRL_PARTS:
10617 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10618 case ISD::CTTZ:
10619 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10620 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10621 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10622 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10623 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10624 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10625 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10626 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10627 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10628 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10629 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10630 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10631 case ISD::SIGN_EXTEND:
10632 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10633 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10634 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10635 case ISD::SET_FPMODE:
10636 return LowerSET_FPMODE(Op, DAG);
10637 case ISD::RESET_FPMODE:
10638 return LowerRESET_FPMODE(Op, DAG);
10639 case ISD::MUL: return LowerMUL(Op, DAG);
// Scalar divides on Windows go through the runtime-helper path; vector
// divides and all non-Windows targets use LowerSDIV / LowerUDIV.
10640 case ISD::SDIV:
10641 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10642 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10643 return LowerSDIV(Op, DAG, Subtarget);
10644 case ISD::UDIV:
10645 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10646 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10647 return LowerUDIV(Op, DAG, Subtarget);
10648 case ISD::UADDO_CARRY:
10649 case ISD::USUBO_CARRY:
10650 return LowerUADDSUBO_CARRY(Op, DAG);
10651 case ISD::SADDO:
10652 case ISD::SSUBO:
10653 return LowerSignedALUO(Op, DAG);
10654 case ISD::UADDO:
10655 case ISD::USUBO:
10656 return LowerUnsignedALUO(Op, DAG);
10657 case ISD::SADDSAT:
10658 case ISD::SSUBSAT:
10659 case ISD::UADDSAT:
10660 case ISD::USUBSAT:
10661 return LowerADDSUBSAT(Op, DAG, Subtarget);
// Loads reaching this switch are handled only by the predicate-load path.
10662 case ISD::LOAD:
10663 return LowerPredicateLoad(Op, DAG);
10664 case ISD::STORE:
10665 return LowerSTORE(Op, DAG, Subtarget);
10666 case ISD::MLOAD:
10667 return LowerMLOAD(Op, DAG);
10668 case ISD::VECREDUCE_MUL:
10669 case ISD::VECREDUCE_AND:
10670 case ISD::VECREDUCE_OR:
10671 case ISD::VECREDUCE_XOR:
10672 return LowerVecReduce(Op, DAG, Subtarget);
10673 case ISD::VECREDUCE_FADD:
10674 case ISD::VECREDUCE_FMUL:
10675 case ISD::VECREDUCE_FMIN:
10676 case ISD::VECREDUCE_FMAX:
10677 return LowerVecReduceF(Op, DAG, Subtarget);
10678 case ISD::VECREDUCE_UMIN:
10679 case ISD::VECREDUCE_UMAX:
10680 case ISD::VECREDUCE_SMIN:
10681 case ISD::VECREDUCE_SMAX:
10682 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10683 case ISD::ATOMIC_LOAD:
10684 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
10685 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
10686 case ISD::SDIVREM:
10687 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
// Only Windows targets are handled here; any other target is treated as
// unreachable.
10688 case ISD::DYNAMIC_STACKALLOC:
10689 if (Subtarget->isTargetWindows())
10690 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10691 llvm_unreachable("Don't know how to custom lower this!");
10693 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10695 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10696 case ISD::STRICT_FSETCC:
10697 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10698 case ISD::SPONENTRY:
10699 return LowerSPONENTRY(Op, DAG);
// WIN__DBZCHK yields an empty SDValue from this switch.
10700 case ARMISD::WIN__DBZCHK: return SDValue();
10701 }
10702}
10703
10705 SelectionDAG &DAG) {
 // Operand 0 of N carries the intrinsic ID. Only the four ARM intrinsics
 // smlald, smlaldx, smlsld and smlsldx are handled; each maps to the ARMISD
 // opcode of the same name. For any other intrinsic this function returns
 // without adding anything to Results.
10706 unsigned IntNo = N->getConstantOperandVal(0);
10707 unsigned Opc = 0;
10708 if (IntNo == Intrinsic::arm_smlald)
10709 Opc = ARMISD::SMLALD;
10710 else if (IntNo == Intrinsic::arm_smlaldx)
10711 Opc = ARMISD::SMLALDX;
10712 else if (IntNo == Intrinsic::arm_smlsld)
10713 Opc = ARMISD::SMLSLD;
10714 else if (IntNo == Intrinsic::arm_smlsldx)
10715 Opc = ARMISD::SMLSLDX;
10716 else
10717 return;
10718
10719 SDLoc dl(N);
 // Split operand 3 into two i32 halves so it can be passed as two operands.
10720 SDValue Lo, Hi;
10721 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10722
 // The target node takes operands 1 and 2 of N plus the two halves, and
 // produces two i32 results.
10723 SDValue LongMul = DAG.getNode(Opc, dl,
10724 DAG.getVTList(MVT::i32, MVT::i32),
10725 N->getOperand(1), N->getOperand(2),
10726 Lo, Hi);
 // Recombine the two i32 results into a single i64 replacement value.
10727 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10728 LongMul.getValue(0), LongMul.getValue(1)));
10729}
10730
10731/// ReplaceNodeResults - Replace the results of node with an illegal result
10732/// type with new values built out of custom code.
///
/// The opcode of N selects the expansion. Cases that compute a single
/// replacement store it in a local value and fall out of the switch, where it
/// is appended to Results if it is non-null. Cases that fill Results
/// themselves return directly. Opcodes without a case hit llvm_unreachable.
10735 SelectionDAG &DAG) const {
10736 SDValue Res;
10737 switch (N->getOpcode()) {
10738 default:
10739 llvm_unreachable("Don't know how to custom expand this!");
10740 case ISD::READ_REGISTER:
10742 break;
10743 case ISD::BITCAST:
10744 Res = ExpandBITCAST(N, DAG, Subtarget);
10745 break;
10746 case ISD::SRL:
10747 case ISD::SRA:
10748 case ISD::SHL:
10749 Res = Expand64BitShift(N, DAG, Subtarget);
10750 break;
10751 case ISD::SREM:
10752 case ISD::UREM:
10753 Res = LowerREM(N, DAG);
10754 break;
10755 case ISD::SDIVREM:
10756 case ISD::UDIVREM:
 // The lowered node carries two values; push both and return so that the
 // common tail below does not push anything else.
10757 Res = LowerDivRem(SDValue(N, 0), DAG);
10758 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10759 Results.push_back(Res.getValue(0));
10760 Results.push_back(Res.getValue(1));
10761 return;
10762 case ISD::SADDSAT:
10763 case ISD::SSUBSAT:
10764 case ISD::UADDSAT:
10765 case ISD::USUBSAT:
10766 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10767 break;
10768 case ISD::READCYCLECOUNTER:
10769 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10770 return;
10771 case ISD::UDIV:
10772 case ISD::SDIV:
 // The third argument tells the helper whether the division is signed.
10773 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
10774 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10775 Results);
10776 case ISD::ATOMIC_CMP_SWAP:
10778 return;
10780 return ReplaceLongIntrinsic(N, Results, DAG);
10781 case ISD::LOAD:
 // LowerLOAD is handed Results directly; Res stays empty here, so the tail
 // below adds nothing further.
10782 LowerLOAD(N, Results, DAG);
10783 break;
10784 case ISD::TRUNCATE:
10785 Res = LowerTruncate(N, DAG, Subtarget);
10786 break;
10787 case ISD::SIGN_EXTEND:
10788 case ISD::ZERO_EXTEND:
10789 Res = LowerVectorExtend(N, DAG, Subtarget);
10790 break;
10793 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10794 break;
10795 }
 // Only record a replacement when the selected case actually produced a node.
10796 if (Res.getNode())
10797 Results.push_back(Res);
10798}
10799
10800//===----------------------------------------------------------------------===//
10801// ARM Scheduler Hooks
10802//===----------------------------------------------------------------------===//
10803
10804/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10805/// registers the function context.
///
/// The emitted sequence is inserted into MBB before MI. It loads a
/// constant-pool entry created for DispatchBB, applies the PIC add for the
/// new PIC label, and stores the result at offset 36 of frame index FI. One
/// of three instruction sequences is used: Thumb2, Thumb1, or ARM.
10806void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10808 MachineBasicBlock *DispatchBB,
10809 int FI) const {
10810 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10811 "ROPI/RWPI not currently supported with SjLj");
10812 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10813 DebugLoc dl = MI.getDebugLoc();
10814 MachineFunction *MF = MBB->getParent();
10818 const Function &F = MF->getFunction();
10819
10820 bool isThumb = Subtarget->isThumb();
10821 bool isThumb2 = Subtarget->isThumb2();
10822
 // The PC adjustment passed to the constant-pool value is 4 for Thumb
 // subtargets and 8 otherwise.
10823 unsigned PCLabelId = AFI->createPICLabelUId();
10824 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10826 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10827 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10828
 // Any Thumb subtarget uses the tGPR register class for the temporaries;
 // ARM mode uses GPR.
10829 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10830 : &ARM::GPRRegClass;
10831
10832 // Grab constant pool and fixed stack memory operands.
10833 MachineMemOperand *CPMMO =
10836
10837 MachineMemOperand *FIMMOSt =
10840
10841 // Load the address of the dispatch MBB into the jump buffer.
10842 if (isThumb2) {
10843 // Incoming value: jbuf
10844 // ldr.n r5, LCPI1_1
10845 // orr r5, r5, #1
10846 // add r5, pc
10847 // str r5, [$jbuf, #+4] ; &jbuf[1]
10848 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10849 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10851 .addMemOperand(CPMMO)
10853 // Set the low bit because of thumb mode.
10854 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10855 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10856 .addReg(NewVReg1, RegState::Kill)
10857 .addImm(0x01)
10859 .add(condCodeOp());
10860 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10861 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10862 .addReg(NewVReg2, RegState::Kill)
10863 .addImm(PCLabelId);
10864 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10865 .addReg(NewVReg3, RegState::Kill)
10866 .addFrameIndex(FI)
10867 .addImm(36) // &jbuf[1] :: pc
10868 .addMemOperand(FIMMOSt)
10870 } else if (isThumb) {
10871 // Incoming value: jbuf
10872 // ldr.n r1, LCPI1_4
10873 // add r1, pc
10874 // mov r2, #1
10875 // orrs r1, r2
10876 // add r2, $jbuf, #+4 ; &jbuf[1]
10877 // str r1, [r2]
10878 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10879 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10881 .addMemOperand(CPMMO)
10883 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10884 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10885 .addReg(NewVReg1, RegState::Kill)
10886 .addImm(PCLabelId);
10887 // Set the low bit because of thumb mode.
10888 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10889 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10890 .addReg(ARM::CPSR, RegState::Define)
10891 .addImm(1)
10893 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10894 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10895 .addReg(ARM::CPSR, RegState::Define)
10896 .addReg(NewVReg2, RegState::Kill)
10897 .addReg(NewVReg3, RegState::Kill)
 // This path first forms the slot address (frame index FI plus 36) in a
 // register, then stores through it with a zero offset.
10899 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10900 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10901 .addFrameIndex(FI)
10902 .addImm(36); // &jbuf[1] :: pc
10903 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10904 .addReg(NewVReg4, RegState::Kill)
10905 .addReg(NewVReg5, RegState::Kill)
10906 .addImm(0)
10907 .addMemOperand(FIMMOSt)
10909 } else {
10910 // Incoming value: jbuf
10911 // ldr r1, LCPI1_1
10912 // add r1, pc, r1
10913 // str r1, [$jbuf, #+4] ; &jbuf[1]
10914 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10915 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10917 .addImm(0)
10918 .addMemOperand(CPMMO)
10920 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10921 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10922 .addReg(NewVReg1, RegState::Kill)
10923 .addImm(PCLabelId)
10925 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10926 .addReg(NewVReg2, RegState::Kill)
10927 .addFrameIndex(FI)
10928 .addImm(36) // &jbuf[1] :: pc
10929 .addMemOperand(FIMMOSt)
10931 }
10932}
10933
/// EmitSjLjDispatchBlock - Build the SjLj dispatch code for the function
/// containing MBB.
///
/// The function gathers the landing pads associated with each call-site
/// number, builds an inline jump table over them, and creates three new
/// blocks appended to the function: a dispatch block (marked as an EH pad), a
/// continuation block that branches through the jump table, and a trap block.
/// The dispatch block loads a value from the function-context frame index,
/// compares it against the number of landing pads and conditionally branches
/// to the trap block. Blocks that preceded the landing pads are rewired to the
/// dispatch block, the former landing pads lose their EH-pad flag, and MI is
/// erased at the end.
10934void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10935 MachineBasicBlock *MBB) const {
10936 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10937 DebugLoc dl = MI.getDebugLoc();
10938 MachineFunction *MF = MBB->getParent();
10940 MachineFrameInfo &MFI = MF->getFrameInfo();
10941 int FI = MFI.getFunctionContextIndex();
10942
10943 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10944 : &ARM::GPRnopcRegClass;
10945
10946 // Get a mapping of the call site numbers to all of the landing pads they're
10947 // associated with.
10949 unsigned MaxCSNum = 0;
10950 for (MachineBasicBlock &BB : *MF) {
10951 if (!BB.isEHPad())
10952 continue;
10953
10954 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10955 // pad.
10956 for (MachineInstr &II : BB) {
10957 if (!II.isEHLabel())
10958 continue;
10959
10960 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10961 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10962
10963 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10964 for (unsigned Idx : CallSiteIdxs) {
10965 CallSiteNumToLPad[Idx].push_back(&BB);
10966 MaxCSNum = std::max(MaxCSNum, Idx);
10967 }
 // Only the first EH label with registered call sites is used per block.
10968 break;
10969 }
10970 }
10971
10972 // Get an ordered list of the machine basic blocks for the jump table.
 // Call-site numbers are visited from 1 up to MaxCSNum; the predecessors of
 // every landing pad are collected in InvokeBBs along the way.
10973 std::vector<MachineBasicBlock*> LPadList;
10975 LPadList.reserve(CallSiteNumToLPad.size());
10976 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10977 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10978 for (MachineBasicBlock *MBB : MBBList) {
10979 LPadList.push_back(MBB);
10980 InvokeBBs.insert(MBB->pred_begin(), MBB->pred_end());
10981 }
10982 }
10983
10984 assert(!LPadList.empty() &&
10985 "No landing pad destinations for the dispatch jump table!");
10986
10987 // Create the jump table and associated information.
10989 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
10990 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10991
10992 // Create the MBBs for the dispatch code.
10993
10994 // Shove the dispatch's address into the return slot in the function context.
10995 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
10996 DispatchBB->setIsEHPad();
10997
 // The trap block holds a single trap instruction chosen by subtarget mode.
10998 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10999 unsigned trap_opcode;
11000 if (Subtarget->isThumb())
11001 trap_opcode = ARM::tTRAP;
11002 else
11003 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
11004
11005 BuildMI(TrapBB, dl, TII->get(trap_opcode));
11006 DispatchBB->addSuccessor(TrapBB);
11007
11008 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
11009 DispatchBB->addSuccessor(DispContBB);
11010
11011 // Insert the new MBBs at the end of the function.
11012 MF->insert(MF->end(), DispatchBB);
11013 MF->insert(MF->end(), DispContBB);
11014 MF->insert(MF->end(), TrapBB);
11015
11016 // Insert code into the entry block that creates and registers the function
11017 // context.
11018 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
11019
11020 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
11023
11025 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
11026
11027 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
11028 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
11029
11030 // Add a register mask with no preserved registers. This results in all
11031 // registers being marked as clobbered. This can't work if the dispatch block
11032 // is in a Thumb1 function and is linked with ARM code which uses the FP
11033 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
11035
 // The three branches below emit the same logical sequence for Thumb2,
 // Thumb1 and ARM respectively: load from the function-context frame index,
 // compare against NumLPads (using an immediate when NumLPads < 256),
 // conditionally branch to TrapBB, then index the jump table in DispContBB.
11036 bool IsPositionIndependent = isPositionIndependent();
11037 unsigned NumLPads = LPadList.size();
11038 if (Subtarget->isThumb2()) {
11039 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11040 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
11041 .addFrameIndex(FI)
11042 .addImm(4)
11043 .addMemOperand(FIMMOLd)
11045
11046 if (NumLPads < 256) {
11047 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
11048 .addReg(NewVReg1)
11049 .addImm(LPadList.size())
11051 } else {
 // Materialize NumLPads with a 16-bit move, adding a move-top only when
 // the upper 16 bits are non-zero.
11052 Register VReg1 = MRI->createVirtualRegister(TRC);
11053 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
11054 .addImm(NumLPads & 0xFFFF)
11056
11057 unsigned VReg2 = VReg1;
11058 if ((NumLPads & 0xFFFF0000) != 0) {
11059 VReg2 = MRI->createVirtualRegister(TRC);
11060 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
11061 .addReg(VReg1)
11062 .addImm(NumLPads >> 16)
11064 }
11065
11066 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
11067 .addReg(NewVReg1)
11068 .addReg(VReg2)
11070 }
11071
11072 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
11073 .addMBB(TrapBB)
11075 .addReg(ARM::CPSR);
11076
11077 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11078 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
11079 .addJumpTableIndex(MJTI)
11081
11082 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11083 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
11084 .addReg(NewVReg3, RegState::Kill)
11085 .addReg(NewVReg1)
11088 .add(condCodeOp());
11089
11090 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
11091 .addReg(NewVReg4, RegState::Kill)
11092 .addReg(NewVReg1)
11093 .addJumpTableIndex(MJTI);
11094 } else if (Subtarget->isThumb()) {
11095 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11096 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
11097 .addFrameIndex(FI)
11098 .addImm(1)
11099 .addMemOperand(FIMMOLd)
11101
11102 if (NumLPads < 256) {
11103 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
11104 .addReg(NewVReg1)
11105 .addImm(NumLPads)
11107 } else {
 // Larger counts are loaded from a constant-pool entry before comparing.
11108 MachineConstantPool *ConstantPool = MF->getConstantPool();
11109 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11110 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11111
11112 // MachineConstantPool wants an explicit alignment.
11113 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11114 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11115
11116 Register VReg1 = MRI->createVirtualRegister(TRC);
11117 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
11118 .addReg(VReg1, RegState::Define)
11121 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
11122 .addReg(NewVReg1)
11123 .addReg(VReg1)
11125 }
11126
11127 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
11128 .addMBB(TrapBB)
11130 .addReg(ARM::CPSR);
11131
 // Shift the loaded value left by 2 and add it to the jump-table address.
11132 Register NewVReg2 = MRI->createVirtualRegister(TRC);
11133 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
11134 .addReg(ARM::CPSR, RegState::Define)
11135 .addReg(NewVReg1)
11136 .addImm(2)
11138
11139 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11140 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
11141 .addJumpTableIndex(MJTI)
11143
11144 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11145 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
11146 .addReg(ARM::CPSR, RegState::Define)
11147 .addReg(NewVReg2, RegState::Kill)
11148 .addReg(NewVReg3)
11150
11151 MachineMemOperand *JTMMOLd =
11152 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11154
11155 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11156 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
11157 .addReg(NewVReg4, RegState::Kill)
11158 .addImm(0)
11159 .addMemOperand(JTMMOLd)
11161
 // In position-independent code the loaded entry is additionally added to
 // the jump-table address before branching.
11162 unsigned NewVReg6 = NewVReg5;
11163 if (IsPositionIndependent) {
11164 NewVReg6 = MRI->createVirtualRegister(TRC);
11165 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
11166 .addReg(ARM::CPSR, RegState::Define)
11167 .addReg(NewVReg5, RegState::Kill)
11168 .addReg(NewVReg3)
11170 }
11171
11172 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
11173 .addReg(NewVReg6, RegState::Kill)
11174 .addJumpTableIndex(MJTI);
11175 } else {
11176 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11177 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
11178 .addFrameIndex(FI)
11179 .addImm(4)
11180 .addMemOperand(FIMMOLd)
11182
11183 if (NumLPads < 256) {
11184 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
11185 .addReg(NewVReg1)
11186 .addImm(NumLPads)
11188 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
11189 Register VReg1 = MRI->createVirtualRegister(TRC);
11190 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11191 .addImm(NumLPads & 0xFFFF)
11193
11194 unsigned VReg2 = VReg1;
11195 if ((NumLPads & 0xFFFF0000) != 0) {
11196 VReg2 = MRI->createVirtualRegister(TRC);
11197 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11198 .addReg(VReg1)
11199 .addImm(NumLPads >> 16)
11201 }
11202
11203 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11204 .addReg(NewVReg1)
11205 .addReg(VReg2)
11207 } else {
11208 MachineConstantPool *ConstantPool = MF->getConstantPool();
11209 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11210 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11211
11212 // MachineConstantPool wants an explicit alignment.
11213 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11214 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11215
11216 Register VReg1 = MRI->createVirtualRegister(TRC);
11217 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11218 .addReg(VReg1, RegState::Define)
11220 .addImm(0)
11222 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11223 .addReg(NewVReg1)
11224 .addReg(VReg1, RegState::Kill)
11226 }
11227
11228 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11229 .addMBB(TrapBB)
11231 .addReg(ARM::CPSR);
11232
11233 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11234 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11235 .addReg(NewVReg1)
11238 .add(condCodeOp());
11239 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11240 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11241 .addJumpTableIndex(MJTI)
11243
11244 MachineMemOperand *JTMMOLd =
11245 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11247 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11248 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11249 .addReg(NewVReg3, RegState::Kill)
11250 .addReg(NewVReg4)
11251 .addImm(0)
11252 .addMemOperand(JTMMOLd)
11254
 // The position-independent variant of the branch also takes the
 // jump-table address register.
11255 if (IsPositionIndependent) {
11256 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11257 .addReg(NewVReg5, RegState::Kill)
11258 .addReg(NewVReg4)
11259 .addJumpTableIndex(MJTI);
11260 } else {
11261 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11262 .addReg(NewVReg5, RegState::Kill)
11263 .addJumpTableIndex(MJTI);
11264 }
11265 }
11266
11267 // Add the jump table entries as successors to the MBB.
 // SeenMBBs ensures each landing pad is added as a successor only once.
11269 for (MachineBasicBlock *CurMBB : LPadList) {
11270 if (SeenMBBs.insert(CurMBB).second)
11271 DispContBB->addSuccessor(CurMBB);
11272 }
11273
11274 // N.B. the order the invoke BBs are processed in doesn't matter here.
11275 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11277 for (MachineBasicBlock *BB : InvokeBBs) {
11278
11279 // Remove the landing pad successor from the invoke block and replace it
11280 // with the new dispatch block.
 // Iterate over a copy of the successor list because it is modified below.
11281 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11282 while (!Successors.empty()) {
11283 MachineBasicBlock *SMBB = Successors.pop_back_val();
11284 if (SMBB->isEHPad()) {
11285 BB->removeSuccessor(SMBB);
11286 MBBLPads.push_back(SMBB);
11287 }
11288 }
11289
11290 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11291 BB->normalizeSuccProbs();
11292
11293 // Find the invoke call and mark all of the callee-saved registers as
11294 // 'implicit defined' so that they're spilled. This prevents code from
11295 // moving instructions to before the EH block, where they will never be
11296 // executed.
 // The block is scanned backwards and only the last call in it is updated.
11298 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11299 if (!II->isCall()) continue;
11300
11303 OI = II->operands_begin(), OE = II->operands_end();
11304 OI != OE; ++OI) {
11305 if (!OI->isReg()) continue;
11306 DefRegs[OI->getReg()] = true;
11307 }
11308
11309 MachineInstrBuilder MIB(*MF, &*II);
11310
 // SavedRegs is a zero-terminated list; registers outside the classes
 // accepted for the current subtarget mode are skipped.
11311 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11312 unsigned Reg = SavedRegs[i];
11313 if (Subtarget->isThumb2() &&
11314 !ARM::tGPRRegClass.contains(Reg) &&
11315 !ARM::hGPRRegClass.contains(Reg))
11316 continue;
11317 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11318 continue;
11319 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11320 continue;
11321 if (!DefRegs[Reg])
11323 }
11324
11325 break;
11326 }
11327 }
11328
11329 // Mark all former landing pads as non-landing pads. The dispatch is the only
11330 // landing pad now.
11331 for (MachineBasicBlock *MBBLPad : MBBLPads)
11332 MBBLPad->setIsEHPad(false);
11333
11334 // The instruction is gone now.
11335 MI.eraseFromParent();
11336}
11337
/// Return the first successor of MBB that is not Succ. Reaching the end of
/// the successor list without finding one is treated as unreachable.
11338static
11340 for (MachineBasicBlock *S : MBB->successors())
11341 if (S != Succ)
11342 return S;
11343 llvm_unreachable("Expecting a BB with two successors!");
11344}
11345
11346/// Return the load opcode for a given load size. If load size >= 8,
11347/// neon opcode will be returned.
11348static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11349 if (LdSize >= 8)
11350 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11351 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11352 if (IsThumb1)
11353 return LdSize == 4 ? ARM::tLDRi
11354 : LdSize == 2 ? ARM::tLDRHi
11355 : LdSize == 1 ? ARM::tLDRBi : 0;
11356 if (IsThumb2)
11357 return LdSize == 4 ? ARM::t2LDR_POST
11358 : LdSize == 2 ? ARM::t2LDRH_POST
11359 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11360 return LdSize == 4 ? ARM::LDR_POST_IMM
11361 : LdSize == 2 ? ARM::LDRH_POST
11362 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11363}
11364
11365/// Return the store opcode for a given store size. If store size >= 8,
11366/// neon opcode will be returned.
11367static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11368 if (StSize >= 8)
11369 return StSize == 16 ? ARM::VST1q32wb_fixed
11370 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11371 if (IsThumb1)
11372 return StSize == 4 ? ARM::tSTRi
11373 : StSize == 2 ? ARM::tSTRHi
11374 : StSize == 1 ? ARM::tSTRBi : 0;
11375 if (IsThumb2)
11376 return StSize == 4 ? ARM::t2STR_POST
11377 : StSize == 2 ? ARM::t2STRH_POST
11378 : StSize == 1 ? ARM::t2STRB_POST : 0;
11379 return StSize == 4 ? ARM::STR_POST_IMM
11380 : StSize == 2 ? ARM::STRH_POST
11381 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11382}
11383
11384/// Emit a post-increment load operation with given size. The instructions
11385/// will be added to BB at Pos.
///
/// Data is the register defined by the load, AddrIn is the address register
/// read by it, and AddrOut is the register defined with the updated address.
/// The opcode comes from getLdOpcode; a zero opcode trips the assertion.
11387 const TargetInstrInfo *TII, const DebugLoc &dl,
11388 unsigned LdSize, unsigned Data, unsigned AddrIn,
11389 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11390 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11391 assert(LdOpc != 0 && "Should have a load opcode");
11392 if (LdSize >= 8) {
 // Sizes of 8 or more use the NEON opcode chosen by getLdOpcode; AddrOut is
 // defined by the same instruction.
11393 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11394 .addReg(AddrOut, RegState::Define)
11395 .addReg(AddrIn)
11396 .addImm(0)
11398 } else if (IsThumb1) {
11399 // load + update AddrIn
 // Two instructions: a load at offset 0, then a separate add of LdSize that
 // defines AddrOut.
11400 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11401 .addReg(AddrIn)
11402 .addImm(0)
11404 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11405 .add(t1CondCodeOp())
11406 .addReg(AddrIn)
11407 .addImm(LdSize)
11409 } else if (IsThumb2) {
11410 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11411 .addReg(AddrOut, RegState::Define)
11412 .addReg(AddrIn)
11413 .addImm(LdSize)
11415 } else { // arm
 // The ARM form takes an extra register operand, passed as register 0.
11416 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11417 .addReg(AddrOut, RegState::Define)
11418 .addReg(AddrIn)
11419 .addReg(0)
11420 .addImm(LdSize)
11422 }
11423}
11424
11425/// Emit a post-increment store operation with given size. The instructions
11426/// will be added to BB at Pos.
///
/// Data is the register being stored, AddrIn is the address register read by
/// the store, and AddrOut is the register defined with the updated address.
/// The opcode comes from getStOpcode; a zero opcode trips the assertion.
11428 const TargetInstrInfo *TII, const DebugLoc &dl,
11429 unsigned StSize, unsigned Data, unsigned AddrIn,
11430 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11431 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11432 assert(StOpc != 0 && "Should have a store opcode");
11433 if (StSize >= 8) {
 // Sizes of 8 or more use the NEON opcode chosen by getStOpcode; here the
 // address operands come before the data register.
11434 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11435 .addReg(AddrIn)
11436 .addImm(0)
11437 .addReg(Data)
11439 } else if (IsThumb1) {
11440 // store + update AddrIn
 // Two instructions: a store at offset 0, then a separate add of StSize that
 // defines AddrOut.
11441 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11442 .addReg(Data)
11443 .addReg(AddrIn)
11444 .addImm(0)
11446 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11447 .add(t1CondCodeOp())
11448 .addReg(AddrIn)
11449 .addImm(StSize)
11451 } else if (IsThumb2) {
11452 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11453 .addReg(Data)
11454 .addReg(AddrIn)
11455 .addImm(StSize)
11457 } else { // arm
 // The ARM form takes an extra register operand, passed as register 0.
11458 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11459 .addReg(Data)
11460 .addReg(AddrIn)
11461 .addReg(0)
11462 .addImm(StSize)
11464 }
11465}
11466
11468ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11469 MachineBasicBlock *BB) const {
11470 // This pseudo instruction has 3 operands: dst, src, size
11471 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11472 // Otherwise, we will generate unrolled scalar copies.
11473 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11474 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11476
11477 Register dest = MI.getOperand(0).getReg();
11478 Register src = MI.getOperand(1).getReg();
11479 unsigned SizeVal = MI.getOperand(2).getImm();
11480 unsigned Alignment = MI.getOperand(3).getImm();
11481 DebugLoc dl = MI.getDebugLoc();
11482
11483 MachineFunction *MF = BB->getParent();
11485 unsigned UnitSize = 0;
11486 const TargetRegisterClass *TRC = nullptr;
11487 const TargetRegisterClass *VecTRC = nullptr;
11488
11489 bool IsThumb1 = Subtarget->isThumb1Only();
11490 bool IsThumb2 = Subtarget->isThumb2();
11491 bool IsThumb = Subtarget->isThumb();
11492
11493 if (Alignment & 1) {
11494 UnitSize = 1;
11495 } else if (Alignment & 2) {
11496 UnitSize = 2;
11497 } else {
11498 // Check whether we can use NEON instructions.
11499 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11500 Subtarget->hasNEON()) {
11501 if ((Alignment % 16 == 0) && SizeVal >= 16)
11502 UnitSize = 16;
11503 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11504 UnitSize = 8;
11505 }
11506 // Can't use NEON instructions.
11507 if (UnitSize == 0)
11508 UnitSize = 4;
11509 }
11510
11511 // Select the correct opcode and register class for unit size load/store
11512 bool IsNeon = UnitSize >= 8;
11513 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11514 if (IsNeon)
11515 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11516 : UnitSize == 8 ? &ARM::DPRRegClass
11517 : nullptr;
11518
11519 unsigned BytesLeft = SizeVal % UnitSize;
11520 unsigned LoopSize = SizeVal - BytesLeft;
11521
11522 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11523 // Use LDR and STR to copy.
11524 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11525 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11526 unsigned srcIn = src;
11527 unsigned destIn = dest;
11528 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11529 Register srcOut = MRI.createVirtualRegister(TRC);
11530 Register destOut = MRI.createVirtualRegister(TRC);
11531 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11532 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11533 IsThumb1, IsThumb2);
11534 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11535 IsThumb1, IsThumb2);
11536 srcIn = srcOut;
11537 destIn = destOut;
11538 }
11539
11540 // Handle the leftover bytes with LDRB and STRB.
11541 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11542 // [destOut] = STRB_POST(scratch, destIn, 1)
11543 for (unsigned i = 0; i < BytesLeft; i++) {
11544 Register srcOut = MRI.createVirtualRegister(TRC);
11545 Register destOut = MRI.createVirtualRegister(TRC);
11546 Register scratch = MRI.createVirtualRegister(TRC);
11547 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11548 IsThumb1, IsThumb2);
11549 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11550 IsThumb1, IsThumb2);
11551 srcIn = srcOut;
11552 destIn = destOut;
11553 }
11554 MI.eraseFromParent(); // The instruction is gone now.
11555 return BB;
11556 }
11557
11558 // Expand the pseudo op to a loop.
11559 // thisMBB:
11560 // ...
11561 // movw varEnd, # --> with thumb2
11562 // movt varEnd, #
11563 // ldrcp varEnd, idx --> without thumb2
11564 // fallthrough --> loopMBB
11565 // loopMBB:
11566 // PHI varPhi, varEnd, varLoop
11567 // PHI srcPhi, src, srcLoop
11568 // PHI destPhi, dst, destLoop
11569 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11570 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11571 // subs varLoop, varPhi, #UnitSize
11572 // bne loopMBB
11573 // fallthrough --> exitMBB
11574 // exitMBB:
11575 // epilogue to handle left-over bytes
11576 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11577 // [destOut] = STRB_POST(scratch, destLoop, 1)
11578 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11579 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11580 MF->insert(It, loopMBB);
11581 MF->insert(It, exitMBB);
11582
11583 // Set the call frame size on entry to the new basic blocks.
11584 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11585 loopMBB->setCallFrameSize(CallFrameSize);
11586 exitMBB->setCallFrameSize(CallFrameSize);
11587
11588 // Transfer the remainder of BB and its successor edges to exitMBB.
11589 exitMBB->splice(exitMBB->begin(), BB,
11590 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11592
11593 // Load an immediate to varEnd.
11594 Register varEnd = MRI.createVirtualRegister(TRC);
11595 if (Subtarget->useMovt()) {
11596 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11597 varEnd)
11598 .addImm(LoopSize);
11599 } else if (Subtarget->genExecuteOnly()) {
11600 assert(IsThumb && "Non-thumb expected to have used movt");
11601 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11602 } else {
11605 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11606
11607 // MachineConstantPool wants an explicit alignment.
11608 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11609 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11610 MachineMemOperand *CPMMO =
11613
11614 if (IsThumb)
11615 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11616 .addReg(varEnd, RegState::Define)
11619 .addMemOperand(CPMMO);
11620 else
11621 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11622 .addReg(varEnd, RegState::Define)
11624 .addImm(0)
11626 .addMemOperand(CPMMO);
11627 }
11628 BB->addSuccessor(loopMBB);
11629
11630 // Generate the loop body:
11631 // varPhi = PHI(varLoop, varEnd)
11632 // srcPhi = PHI(srcLoop, src)
11633 // destPhi = PHI(destLoop, dst)
11634 MachineBasicBlock *entryBB = BB;
11635 BB = loopMBB;
11636 Register varLoop = MRI.createVirtualRegister(TRC);
11637 Register varPhi = MRI.createVirtualRegister(TRC);
11638 Register srcLoop = MRI.createVirtualRegister(TRC);
11639 Register srcPhi = MRI.createVirtualRegister(TRC);
11640 Register destLoop = MRI.createVirtualRegister(TRC);
11641 Register destPhi = MRI.createVirtualRegister(TRC);
11642
11643 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11644 .addReg(varLoop).addMBB(loopMBB)
11645 .addReg(varEnd).addMBB(entryBB);
11646 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11647 .addReg(srcLoop).addMBB(loopMBB)
11648 .addReg(src).addMBB(entryBB);
11649 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11650 .addReg(destLoop).addMBB(loopMBB)
11651 .addReg(dest).addMBB(entryBB);
11652
11653 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11654 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11655 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11656 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11657 IsThumb1, IsThumb2);
11658 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11659 IsThumb1, IsThumb2);
11660
11661 // Decrement loop variable by UnitSize.
11662 if (IsThumb1) {
11663 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11664 .add(t1CondCodeOp())
11665 .addReg(varPhi)
11666 .addImm(UnitSize)
11668 } else {
11670 BuildMI(*BB, BB->end(), dl,
11671 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11672 MIB.addReg(varPhi)
11673 .addImm(UnitSize)
11675 .add(condCodeOp());
11676 MIB->getOperand(5).setReg(ARM::CPSR);
11677 MIB->getOperand(5).setIsDef(true);
11678 }
11679 BuildMI(*BB, BB->end(), dl,
11680 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11681 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11682
11683 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11684 BB->addSuccessor(loopMBB);
11685 BB->addSuccessor(exitMBB);
11686
11687 // Add epilogue to handle BytesLeft.
11688 BB = exitMBB;
11689 auto StartOfExit = exitMBB->begin();
11690
11691 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11692 // [destOut] = STRB_POST(scratch, destLoop, 1)
11693 unsigned srcIn = srcLoop;
11694 unsigned destIn = destLoop;
11695 for (unsigned i = 0; i < BytesLeft; i++) {
11696 Register srcOut = MRI.createVirtualRegister(TRC);
11697 Register destOut = MRI.createVirtualRegister(TRC);
11698 Register scratch = MRI.createVirtualRegister(TRC);
11699 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11700 IsThumb1, IsThumb2);
11701 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11702 IsThumb1, IsThumb2);
11703 srcIn = srcOut;
11704 destIn = destOut;
11705 }
11706
11707 MI.eraseFromParent(); // The instruction is gone now.
11708 return BB;
11709}
11710
// Lowers the WIN__CHKSTK pseudo (MI) in place inside MBB. A call to the
// external symbol "__chkstk" is emitted before MI: a direct tBL for the
// Small/Medium/Kernel code models; for the Large code model the symbol address
// is first materialised with t2MOVi32imm into a fresh rGPR virtual register.
// Afterwards SP = t2SUBrr SP, R4 is emitted, MI is erased and MBB is returned.
// NOTE(review): the embedded line numbers skip values inside this function
// (e.g. 11711, 11714, 11747, 11757, 11762-11763), so the return type, the MRI
// declaration and several operands are not visible in this listing -- confirm
// against the upstream file before relying on the exact operand lists.
11712ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11713 MachineBasicBlock *MBB) const {
11715 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11716 DebugLoc DL = MI.getDebugLoc();
11717
11718 assert(Subtarget->isTargetWindows() &&
11719 "__chkstk is only supported on Windows");
11720 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11721
11722 // __chkstk takes the number of words to allocate on the stack in R4, and
11723 // returns the stack adjustment in number of bytes in R4. This will not
11724 // clobber any other registers (other than the obvious lr).
11725 //
11726 // Although, technically, IP should be considered a register which may be
11727 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11728 // thumb-2 environment, so there is no interworking required. As a result, we
11729 // do not expect a veneer to be emitted by the linker, clobbering IP.
11730 //
11731 // Each module receives its own copy of __chkstk, so no import thunk is
11732 // required, again, ensuring that IP is not clobbered.
11733 //
11734 // Finally, although some linkers may theoretically provide a trampoline for
11735 // out of range calls (which is quite common due to a 32M range limitation of
11736 // branches for Thumb), we can generate the long-call version via
11737 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11738 // IP.
11739
11740 switch (TM.getCodeModel()) {
11741 case CodeModel::Tiny:
11742 llvm_unreachable("Tiny code model not available on ARM.");
// These code models call __chkstk directly with tBL.
11743 case CodeModel::Small:
11744 case CodeModel::Medium:
11745 case CodeModel::Kernel:
11746 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11748 .addExternalSymbol("__chkstk")
11751 .addReg(ARM::R12,
11753 .addReg(ARM::CPSR,
11755 break;
// Large code model: load the address of __chkstk into a register first.
11756 case CodeModel::Large: {
11758 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11759
11760 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11761 .addExternalSymbol("__chkstk");
11764 .addReg(Reg, RegState::Kill)
11767 .addReg(ARM::R12,
11769 .addReg(ARM::CPSR,
11771 break;
11772 }
11773 }
11774
// SP = SP - R4, where R4 holds the byte adjustment described above; both
// source operands carry a kill flag.
11775 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11776 .addReg(ARM::SP, RegState::Kill)
11777 .addReg(ARM::R4, RegState::Kill)
11780 .add(condCodeOp());
11781
11782 MI.eraseFromParent();
11783 return MBB;
11784}
11785
// Lowers the WIN__DBZCHK pseudo (MI). Everything following MI in MBB is moved
// into a new continuation block (ContBB) placed right after MBB, a trap block
// (TrapBB) containing t__brkdiv0 is appended to the function, and MI is
// replaced by a tCMPi8 of operand 0 against 0 followed by a t2Bcc to TrapBB.
// Returns ContBB, where the instructions that followed MI now live.
// Presumably this is a divide-by-zero guard (going by the t__brkdiv0 opcode
// name) -- TODO confirm.
// NOTE(review): the embedded line numbers skip values inside this function
// (11786, 11793, 11797, 11800, 11808, 11811), so the return type, the creation
// of ContBB/TrapBB and the branch condition are not visible in this listing.
11787ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11788 MachineBasicBlock *MBB) const {
11789 DebugLoc DL = MI.getDebugLoc();
11790 MachineFunction *MF = MBB->getParent();
11791 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11792
// Split MBB after MI: the tail becomes ContBB, inserted directly after MBB.
11794 MF->insert(++MBB->getIterator(), ContBB);
11795 ContBB->splice(ContBB->begin(), MBB,
11796 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11798 MBB->addSuccessor(ContBB);
11799
// The trap block goes at the very end of the function.
11801 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11802 MF->push_back(TrapBB);
11803 MBB->addSuccessor(TrapBB);
11804
// Compare operand 0 with zero and conditionally branch to the trap block.
11805 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11806 .addReg(MI.getOperand(0).getReg())
11807 .addImm(0)
11809 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11810 .addMBB(TrapBB)
11812 .addReg(ARM::CPSR);
11813
11814 MI.eraseFromParent();
11815 return ContBB;
11816}
11817
11818// The CPSR operand of SelectItr might be missing a kill marker
11819// because there were multiple uses of CPSR, and ISel didn't know
11820// which to mark. Figure out whether SelectItr should have had a
11821// kill marker, and set it if it should. Returns the correct kill
11822// marker value.
//
// Returns false, leaving SelectItr untouched, when a later instruction in BB
// reads CPSR before any instruction redefines it, or when the end of BB is
// reached and some successor lists CPSR as live-in. Otherwise marks CPSR as
// killed on SelectItr and returns true.
// NOTE(review): the first lines of the signature (embedded numbers
// 11823-11824) are missing from this listing; SelectItr and BB are declared
// there.
11825 const TargetRegisterInfo* TRI) {
11826 // Scan forward through BB for a use/def of CPSR.
11827 MachineBasicBlock::iterator miI(std::next(SelectItr));
11828 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11829 const MachineInstr& mi = *miI;
// A later reader means CPSR is still live after SelectItr: no kill flag.
11830 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11831 return false;
11832 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11833 break; // Should have kill-flag - update below.
11834 }
11835
11836 // If we hit the end of the block, check whether CPSR is live into a
11837 // successor.
11838 if (miI == BB->end()) {
11839 for (MachineBasicBlock *Succ : BB->successors())
11840 if (Succ->isLiveIn(ARM::CPSR))
11841 return false;
11842 }
11843
11844 // We found a def, or hit the end of the basic block and CPSR wasn't live
11845 // out. SelectMI should have a kill flag on CPSR.
11846 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11847 return true;
11848}
11849
11850/// Adds logic in loop entry MBB to calculate loop iteration count and adds
11851/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop
///
/// Appended to TpEntry, in order: t2ADDri (OpSizeReg + 15), t2LSRri (>> 4),
/// t2WhileLoopSetup producing the iteration count in a GPRlr virtual register,
/// t2WhileLoopStart with TpExit as its block operand, and a t2B to TpLoopBody.
/// Returns the virtual register holding the total iteration count.
/// NOTE(review): parts of the signature and some operands (embedded numbers
/// 11852, 11856, 11862, 11869, 11882) are missing from this listing.
11853 MachineBasicBlock *TpLoopBody,
11854 MachineBasicBlock *TpExit, Register OpSizeReg,
11855 const TargetInstrInfo *TII, DebugLoc Dl,
11857 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
11858 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11859 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11860 .addUse(OpSizeReg)
11861 .addImm(15)
11863 .addReg(0);
11864
11865 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11866 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11867 .addUse(AddDestReg, RegState::Kill)
11868 .addImm(4)
11870 .addReg(0);
11871
// The iteration count is created in the GPRlr register class.
11872 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11873 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11874 .addUse(LsrDestReg, RegState::Kill);
11875
// t2WhileLoopStart names TpExit as its target block.
11876 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11877 .addUse(TotalIterationsReg)
11878 .addMBB(TpExit);
11879
// Otherwise continue into the loop body.
11880 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11881 .addMBB(TpLoopBody)
11883
11884 return TotalIterationsReg;
11885}
11886
11887/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2LoopDec and
11888/// t2LoopEnd. These are used by later passes to generate tail predicated
11889/// loops.
///
/// TpLoopBody receives the PHIs, MVE_VCTP8, the element-count decrement, the
/// predicated post-incrementing load (memcpy only) and store, t2LoopDec,
/// t2LoopEnd and a final t2B to TpExit. TpEntry is only used as the incoming
/// block of the PHIs. OpSrcReg is the initial source pointer when IsMemcpy is
/// true; otherwise it is passed directly as the value operand of the store.
/// OpDestReg is the initial destination pointer, ElementCountReg the initial
/// predication count and TotalIterationsReg the initial loop counter.
/// NOTE(review): some operands (embedded numbers 11943, 11950, 11962, 11973,
/// 11989) are missing from this listing.
11890static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11891 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11892 const TargetInstrInfo *TII, DebugLoc Dl,
11893 MachineRegisterInfo &MRI, Register OpSrcReg,
11894 Register OpDestReg, Register ElementCountReg,
11895 Register TotalIterationsReg, bool IsMemcpy) {
11896 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11897 // array, loop iteration counter, predication counter.
11898
11899 Register SrcPhiReg, CurrSrcReg;
11900 if (IsMemcpy) {
11901 // Current position in the src array
11902 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11903 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11904 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11905 .addUse(OpSrcReg)
11906 .addMBB(TpEntry)
11907 .addUse(CurrSrcReg)
11908 .addMBB(TpLoopBody);
11909 }
11910
11911 // Current position in the dest array
11912 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11913 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11914 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11915 .addUse(OpDestReg)
11916 .addMBB(TpEntry)
11917 .addUse(CurrDestReg)
11918 .addMBB(TpLoopBody);
11919
11920 // Current loop counter
11921 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11922 Register RemainingLoopIterationsReg =
11923 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11924 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11925 .addUse(TotalIterationsReg)
11926 .addMBB(TpEntry)
11927 .addUse(RemainingLoopIterationsReg)
11928 .addMBB(TpLoopBody);
11929
11930 // Predication counter
11931 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11932 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11933 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11934 .addUse(ElementCountReg)
11935 .addMBB(TpEntry)
11936 .addUse(RemainingElementsReg)
11937 .addMBB(TpLoopBody);
11938
11939 // Pass predication counter to VCTP
11940 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11941 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11942 .addUse(PredCounterPhiReg)
11944 .addReg(0)
11945 .addReg(0);
11946
// Remaining elements for the next iteration: current count minus 16.
11947 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11948 .addUse(PredCounterPhiReg)
11949 .addImm(16)
11951 .addReg(0);
11952
11953 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11954 Register SrcValueReg;
11955 if (IsMemcpy) {
11956 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11957 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11958 .addDef(CurrSrcReg)
11959 .addDef(SrcValueReg)
11960 .addReg(SrcPhiReg)
11961 .addImm(16)
11963 .addUse(VccrReg)
11964 .addReg(0);
11965 } else
// Not a memcpy: OpSrcReg itself is the value operand of the store below.
11966 SrcValueReg = OpSrcReg;
11967
11968 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11969 .addDef(CurrDestReg)
11970 .addUse(SrcValueReg)
11971 .addReg(DestPhiReg)
11972 .addImm(16)
11974 .addUse(VccrReg)
11975 .addReg(0);
11976
11977 // Add the pseudoInstrs for decrementing the loop counter and marking the
11978 // end: t2LoopDec and t2LoopEnd
11979 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11980 .addUse(LoopCounterPhiReg)
11981 .addImm(1);
11982
11983 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11984 .addUse(RemainingLoopIterationsReg)
11985 .addMBB(TpLoopBody);
11986
// Explicit branch to the exit block after the loop-end pseudo.
11987 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11988 .addMBB(TpExit)
11990}
11991
// Expands pseudo-instructions that need custom, block-level lowering.
// Dispatches on MI.getOpcode(); each case either retargets MI in place with
// setDesc(), or builds replacement instructions (and, where control flow is
// needed, new basic blocks) and erases MI. The returned block is BB itself or
// the newly created exit/sink block that now holds the instructions which
// followed MI. An opcode without a case prints MI to errs() and reaches
// llvm_unreachable.
// NOTE(review): the embedded line numbers skip values throughout this function
// (e.g. 11992-11993, 12047, 12091, 12130, 12169, 12307, 12344), so the first
// signature lines, the MRI/iterator declarations and several operands are not
// visible in this listing -- confirm against the upstream file.
11994 MachineBasicBlock *BB) const {
11995 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11996 DebugLoc dl = MI.getDebugLoc();
11997 bool isThumb2 = Subtarget->isThumb2();
11998 switch (MI.getOpcode()) {
11999 default: {
12000 MI.print(errs());
12001 llvm_unreachable("Unexpected instr type to insert");
12002 }
12003
12004 // Thumb1 post-indexed loads are really just single-register LDMs.
12005 case ARM::tLDR_postidx: {
12006 MachineOperand Def(MI.getOperand(1));
12007 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
12008 .add(Def) // Rn_wb
12009 .add(MI.getOperand(2)) // Rn
12010 .add(MI.getOperand(3)) // PredImm
12011 .add(MI.getOperand(4)) // PredReg
12012 .add(MI.getOperand(0)) // Rt
12013 .cloneMemRefs(MI);
12014 MI.eraseFromParent();
12015 return BB;
12016 }
12017
12018 case ARM::MVE_MEMCPYLOOPINST:
12019 case ARM::MVE_MEMSETLOOPINST: {
12020
12021 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
12022 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
12023 // the iteration count (= ceil(size_in_bytes / 16)) in the TP entry block and
12024 // adds the relevant instructions in the TP loop Body for generation of a
12025 // WLSTP loop.
12026
12027 // Below is relevant portion of the CFG after the transformation.
12028 // The Machine Basic Blocks are shown along with branch conditions (in
12029 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
12030 // portion of the CFG and may not necessarily be the entry/exit of the
12031 // function.
12032
12033 // (Relevant) CFG after transformation:
12034 //               TP entry MBB
12035 //                   |
12036 //          |-----------------|
12037 //       (n <= 0)          (n > 0)
12038 //          |                 |
12039 //          |         TP loop Body MBB<--|
12040 //          |                |           |
12041 //           \               |___________|
12042 //            \             /
12043 //              TP exit MBB
12044
12045 MachineFunction *MF = BB->getParent();
12046 MachineFunctionProperties &Properties = MF->getProperties();
12048
// Operands of the pseudo: 0 = destination, 1 = source, 2 = size.
12049 Register OpDestReg = MI.getOperand(0).getReg();
12050 Register OpSrcReg = MI.getOperand(1).getReg();
12051 Register OpSizeReg = MI.getOperand(2).getReg();
12052
12053 // Allocate the required MBBs and add to parent function.
12054 MachineBasicBlock *TpEntry = BB;
12055 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
12056 MachineBasicBlock *TpExit;
12057
12058 MF->push_back(TpLoopBody);
12059
12060 // If any instructions are present in the current block after
12061 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
12062 // move the instructions into the newly created exit block. If there are no
12063 // instructions add an explicit branch to the FallThrough block and then
12064 // split.
12065 //
12066 // The split is required for two reasons:
12067 // 1) A terminator(t2WhileLoopStart) will be placed at that site.
12068 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
12069 // need to be updated. splitAt() already handles this.
12070 TpExit = BB->splitAt(MI, false);
12071 if (TpExit == BB) {
12072 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
12073 "block containing memcpy/memset Pseudo");
12074 TpExit = BB->getFallThrough();
12075 BuildMI(BB, dl, TII->get(ARM::t2B))
12076 .addMBB(TpExit)
12078 TpExit = BB->splitAt(MI, false);
12079 }
12080
12081 // Add logic for iteration count
12082 Register TotalIterationsReg =
12083 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
12084
12085 // Add the vectorized (and predicated) loads/store instructions
12086 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
12087 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
12088 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
12089
12090 // Required to avoid conflict with the MachineVerifier during testing.
12092
12093 // Connect the blocks
12094 TpEntry->addSuccessor(TpLoopBody);
12095 TpLoopBody->addSuccessor(TpLoopBody);
12096 TpLoopBody->addSuccessor(TpExit);
12097
12098 // Reorder for a more natural layout
12099 TpLoopBody->moveAfter(TpEntry);
12100 TpExit->moveAfter(TpLoopBody);
12101
12102 // Finally, remove the memcpy Pseudo Instruction
12103 MI.eraseFromParent();
12104
12105 // Return the exit block as it may contain other instructions requiring a
12106 // custom inserter
12107 return TpExit;
12108 }
12109
12110 // The Thumb2 pre-indexed stores have the same MI operands, they just
12111 // define them differently in the .td files from the isel patterns, so
12112 // they need pseudos.
12113 case ARM::t2STR_preidx:
12114 MI.setDesc(TII->get(ARM::t2STR_PRE));
12115 return BB;
12116 case ARM::t2STRB_preidx:
12117 MI.setDesc(TII->get(ARM::t2STRB_PRE));
12118 return BB;
12119 case ARM::t2STRH_preidx:
12120 MI.setDesc(TII->get(ARM::t2STRH_PRE));
12121 return BB;
12122
12123 case ARM::STRi_preidx:
12124 case ARM::STRBi_preidx: {
12125 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12126 : ARM::STRB_PRE_IMM;
12127 // Decode the offset.
12128 unsigned Offset = MI.getOperand(4).getImm();
12129 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
// A subtracting addressing mode is passed on as a negated offset.
12131 if (isSub)
12132 Offset = -Offset;
12133
12134 MachineMemOperand *MMO = *MI.memoperands_begin();
12135 BuildMI(*BB, MI, dl, TII->get(NewOpc))
12136 .add(MI.getOperand(0)) // Rn_wb
12137 .add(MI.getOperand(1)) // Rt
12138 .add(MI.getOperand(2)) // Rn
12139 .addImm(Offset) // offset (skip GPR==zero_reg)
12140 .add(MI.getOperand(5)) // pred
12141 .add(MI.getOperand(6))
12142 .addMemOperand(MMO);
12143 MI.eraseFromParent();
12144 return BB;
12145 }
12146 case ARM::STRr_preidx:
12147 case ARM::STRBr_preidx:
12148 case ARM::STRH_preidx: {
12149 unsigned NewOpc;
12150 switch (MI.getOpcode()) {
12151 default: llvm_unreachable("unexpected opcode!");
12152 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12153 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12154 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12155 }
// Same operand list, only the opcode changes.
12156 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
12157 for (const MachineOperand &MO : MI.operands())
12158 MIB.add(MO);
12159 MI.eraseFromParent();
12160 return BB;
12161 }
12162
12163 case ARM::tMOVCCr_pseudo: {
12164 // To "insert" a SELECT_CC instruction, we actually have to insert the
12165 // diamond control-flow pattern. The incoming instruction knows the
12166 // destination vreg to set, the condition code register to branch on, the
12167 // true/false values to select between, and a branch opcode to use.
12168 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12170
12171 // thisMBB:
12172 // ...
12173 // TrueVal = ...
12174 // cmpTY ccX, r1, r2
12175 // bCC sinkMBB
12176 // fallthrough --> copy0MBB
12177 MachineBasicBlock *thisMBB = BB;
12178 MachineFunction *F = BB->getParent();
12179 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12180 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12181 F->insert(It, copy0MBB);
12182 F->insert(It, sinkMBB);
12183
12184 // Set the call frame size on entry to the new basic blocks.
12185 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12186 copy0MBB->setCallFrameSize(CallFrameSize);
12187 sinkMBB->setCallFrameSize(CallFrameSize);
12188
12189 // Check whether CPSR is live past the tMOVCCr_pseudo.
12190 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12191 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12192 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12193 copy0MBB->addLiveIn(ARM::CPSR);
12194 sinkMBB->addLiveIn(ARM::CPSR);
12195 }
12196
12197 // Transfer the remainder of BB and its successor edges to sinkMBB.
12198 sinkMBB->splice(sinkMBB->begin(), BB,
12199 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12201
12202 BB->addSuccessor(copy0MBB);
12203 BB->addSuccessor(sinkMBB);
12204
// Conditional branch to sinkMBB using the pseudo's condition (operand 3) and
// condition register (operand 4).
12205 BuildMI(BB, dl, TII->get(ARM::tBcc))
12206 .addMBB(sinkMBB)
12207 .addImm(MI.getOperand(3).getImm())
12208 .addReg(MI.getOperand(4).getReg());
12209
12210 // copy0MBB:
12211 // %FalseValue = ...
12212 // # fallthrough to sinkMBB
12213 BB = copy0MBB;
12214
12215 // Update machine-CFG edges
12216 BB->addSuccessor(sinkMBB);
12217
12218 // sinkMBB:
12219 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12220 // ...
12221 BB = sinkMBB;
12222 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12223 .addReg(MI.getOperand(1).getReg())
12224 .addMBB(copy0MBB)
12225 .addReg(MI.getOperand(2).getReg())
12226 .addMBB(thisMBB);
12227
12228 MI.eraseFromParent(); // The pseudo instruction is gone now.
12229 return BB;
12230 }
12231
12232 case ARM::BCCi64:
12233 case ARM::BCCZi64: {
12234 // If there is an unconditional branch to the other successor, remove it.
12235 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12236
12237 // Compare both parts that make up the double comparison separately for
12238 // equality.
12239 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12240
12241 Register LHS1 = MI.getOperand(1).getReg();
12242 Register LHS2 = MI.getOperand(2).getReg();
12243 if (RHSisZero) {
12244 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12245 .addReg(LHS1)
12246 .addImm(0)
// The second compare is predicated on EQ, i.e. on the first compare's result.
12248 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12249 .addReg(LHS2).addImm(0)
12250 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12251 } else {
12252 Register RHS1 = MI.getOperand(3).getReg();
12253 Register RHS2 = MI.getOperand(4).getReg();
12254 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12255 .addReg(LHS1)
12256 .addReg(RHS1)
// The second compare is predicated on EQ, i.e. on the first compare's result.
12258 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12259 .addReg(LHS2).addReg(RHS2)
12260 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12261 }
12262
// For an NE pseudo the two targets are swapped so the branch emitted below can
// always test EQ.
12263 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12264 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12265 if (MI.getOperand(0).getImm() == ARMCC::NE)
12266 std::swap(destMBB, exitMBB);
12267
12268 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12269 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12270 if (isThumb2)
12271 BuildMI(BB, dl, TII->get(ARM::t2B))
12272 .addMBB(exitMBB)
12274 else
12275 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12276
12277 MI.eraseFromParent(); // The pseudo instruction is gone now.
12278 return BB;
12279 }
12280
// The setjmp pseudos are returned unchanged from this hook.
12281 case ARM::Int_eh_sjlj_setjmp:
12282 case ARM::Int_eh_sjlj_setjmp_nofp:
12283 case ARM::tInt_eh_sjlj_setjmp:
12284 case ARM::t2Int_eh_sjlj_setjmp:
12285 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12286 return BB;
12287
12288 case ARM::Int_eh_sjlj_setup_dispatch:
12289 EmitSjLjDispatchBlock(MI, BB);
12290 return BB;
12291
12292 case ARM::ABS:
12293 case ARM::t2ABS: {
12294 // To insert an ABS instruction, we have to insert the
12295 // diamond control-flow pattern. The incoming instruction knows the
12296 // source vreg to test against 0, the destination vreg to set,
12297 // the condition code register to branch on, the
12298 // true/false values to select between, and a branch opcode to use.
12299 // It transforms
12300 // V1 = ABS V0
12301 // into
12302 // CMP V0, #0
12303 // BCC (branch to SinkBB if V0 >= 0)
12304 // RSBBB: V2 = RSBri V0, 0 (compute ABS if V0 < 0)
12305 // SinkBB: V1 = PHI(V2 from RSBBB, V0 from BB)
12306 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12308 MachineFunction *Fn = BB->getParent();
12309 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
12310 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
12311 Fn->insert(BBI, RSBBB);
12312 Fn->insert(BBI, SinkBB);
12313
12314 Register ABSSrcReg = MI.getOperand(1).getReg();
12315 Register ABSDstReg = MI.getOperand(0).getReg();
12316 bool ABSSrcKIll = MI.getOperand(1).isKill();
12317 bool isThumb2 = Subtarget->isThumb2();
12319 // In Thumb mode S must not be specified if source register is the SP or
12320 // PC and if destination register is the SP, so restrict register class
12321 Register NewRsbDstReg = MRI.createVirtualRegister(
12322 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
12323
12324 // Transfer the remainder of BB and its successor edges to SinkBB.
12325 SinkBB->splice(SinkBB->begin(), BB,
12326 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12328
12329 BB->addSuccessor(RSBBB);
12330 BB->addSuccessor(SinkBB);
12331
12332 // fall through to SinkMBB
12333 RSBBB->addSuccessor(SinkBB);
12334
12335 // insert a cmp at the end of BB
12336 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12337 .addReg(ABSSrcReg)
12338 .addImm(0)
12340
12341 // insert a bcc with opposite CC to ARMCC::MI at the end of BB
12342 BuildMI(BB, dl,
12343 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
12345
12346 // insert rsbri in RSBBB
12347 // Note: BCC and rsbri will be converted into predicated rsbmi
12348 // by if-conversion pass
12349 BuildMI(*RSBBB, RSBBB->begin(), dl,
12350 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
12351 .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
12352 .addImm(0)
12354 .add(condCodeOp());
12355
12356 // insert PHI in SinkBB,
12357 // reuse ABSDstReg to not change uses of ABS instruction
12358 BuildMI(*SinkBB, SinkBB->begin(), dl,
12359 TII->get(ARM::PHI), ABSDstReg)
12360 .addReg(NewRsbDstReg).addMBB(RSBBB)
12361 .addReg(ABSSrcReg).addMBB(BB);
12362
12363 // remove ABS instruction
12364 MI.eraseFromParent();
12365
12366 // return last added BB
12367 return SinkBB;
12368 }
// The remaining pseudos are delegated to dedicated helpers.
12369 case ARM::COPY_STRUCT_BYVAL_I32:
12370 ++NumLoopByVals;
12371 return EmitStructByval(MI, BB);
12372 case ARM::WIN__CHKSTK:
12373 return EmitLowered__chkstk(MI, BB);
12374 case ARM::WIN__DBZCHK:
12375 return EmitLowered__dbzchk(MI, BB);
12376 }
12377}
12378
12379/// Attaches vregs to MEMCPY that it will use as scratch registers
12380/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12381/// instead of as a custom inserter because we need the use list from the SDNode.
///
/// Operands 0 and 1 of MI are marked dead when the corresponding result value
/// (0 or 1) of Node has no uses. Operand 4 of MI is an immediate giving the
/// number of scratch registers to create; they come from tGPR on Thumb1-only
/// subtargets and from GPR otherwise.
/// NOTE(review): the lines declaring MRI and attaching TmpReg to MI (embedded
/// numbers 12388 and 12403) are missing from this listing.
12382static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12383 MachineInstr &MI, const SDNode *Node) {
12384 bool isThumb1 = Subtarget->isThumb1Only();
12385
12386 DebugLoc DL = MI.getDebugLoc();
12387 MachineFunction *MF = MI.getParent()->getParent();
12389 MachineInstrBuilder MIB(*MF, MI);
12390
12391 // If the new dst/src is unused mark it as dead.
12392 if (!Node->hasAnyUseOfValue(0)) {
12393 MI.getOperand(0).setIsDead(true);
12394 }
12395 if (!Node->hasAnyUseOfValue(1)) {
12396 MI.getOperand(1).setIsDead(true);
12397 }
12398
12399 // The MEMCPY both defines and kills the scratch registers.
12400 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12401 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12402 : &ARM::GPRRegClass);
12404 }
12405}
12406
// Post-isel fix-up for a single selected instruction. MEMCPY only gets its
// scratch registers attached. For every other instruction the flag-setting
// pseudo opcode (if any) is renamed via convertAddSubFlagsOpcode, an optional
// cc_out operand is appended, and an implicit CPSR def found among the
// operands beyond those declared by the descriptor is removed; the optional
// cc_out operand is then set to CPSR unless that def was dead on a
// non-Thumb1 subtarget.
// NOTE(review): the first signature line and one statement in the Thumb1
// branch (embedded numbers 12407 and 12456) are missing from this listing.
12408 SDNode *Node) const {
12409 if (MI.getOpcode() == ARM::MEMCPY) {
12410 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12411 return;
12412 }
12413
12414 const MCInstrDesc *MCID = &MI.getDesc();
12415 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
12416 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12417 // operand is still set to noreg. If needed, set the optional operand's
12418 // register to CPSR, and remove the redundant implicit def.
12419 //
12420 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12421
12422 // Rename pseudo opcodes.
12423 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12424 unsigned ccOutIdx;
12425 if (NewOpc) {
12426 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12427 MCID = &TII->get(NewOpc);
12428
12429 assert(MCID->getNumOperands() ==
12430 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12431 && "converted opcode should be the same except for cc_out"
12432 " (and, on Thumb1, pred)");
12433
12434 MI.setDesc(*MCID);
12435
12436 // Add the optional cc_out operand
12437 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12438
12439 // On Thumb1, move all input operands to the end, then add the predicate
12440 if (Subtarget->isThumb1Only()) {
// Each iteration appends a copy of operand 1 and then deletes the original.
12441 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12442 MI.addOperand(MI.getOperand(1));
12443 MI.removeOperand(1);
12444 }
12445
12446 // Restore the ties
12447 for (unsigned i = MI.getNumOperands(); i--;) {
12448 const MachineOperand& op = MI.getOperand(i);
12449 if (op.isReg() && op.isUse()) {
12450 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12451 if (DefIdx != -1)
12452 MI.tieOperands(DefIdx, i);
12453 }
12454 }
12455
12457 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
// In the Thumb1 layout cc_out is looked up at index 1 rather than last.
12458 ccOutIdx = 1;
12459 } else
12460 ccOutIdx = MCID->getNumOperands() - 1;
12461 } else
12462 ccOutIdx = MCID->getNumOperands() - 1;
12463
12464 // Any ARM instruction that sets the 's' bit should specify an optional
12465 // "cc_out" operand in the last operand position.
12466 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12467 assert(!NewOpc && "Optional cc_out operand required");
12468 return;
12469 }
12470 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12471 // since we already have an optional CPSR def.
12472 bool definesCPSR = false;
12473 bool deadCPSR = false;
// Only operands beyond those declared by MCID are scanned.
12474 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12475 ++i) {
12476 const MachineOperand &MO = MI.getOperand(i);
12477 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12478 definesCPSR = true;
12479 if (MO.isDead())
12480 deadCPSR = true;
12481 MI.removeOperand(i);
12482 break;
12483 }
12484 }
12485 if (!definesCPSR) {
12486 assert(!NewOpc && "Optional cc_out operand required");
12487 return;
12488 }
12489 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12490 if (deadCPSR) {
12491 assert(!MI.getOperand(ccOutIdx).getReg() &&
12492 "expect uninitialized optional cc_out operand");
12493 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12494 if (!Subtarget->isThumb1Only())
12495 return;
12496 }
12497
12498 // If this instruction was defined with an optional CPSR def and its dag node
12499 // had a live implicit CPSR def, then activate the optional CPSR def.
12500 MachineOperand &MO = MI.getOperand(ccOutIdx);
12501 MO.setReg(ARM::CPSR);
12502 MO.setIsDef(true);
12503}
12504
12505//===----------------------------------------------------------------------===//
12506// ARM Optimization Hooks
12507//===----------------------------------------------------------------------===//
12508
12509// Helper function that checks if N is a null or all ones constant.
// Presumably AllOnes selects which of the two constants is tested (all ones
// when true, null when false) -- TODO confirm.
// NOTE(review): the body line (embedded number 12511) is missing from this
// listing.
12510static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12512}
12513
12514// Return true if N is conditionally 0 or all ones.
12515// Detects these expressions where cc is an i1 value:
12516//
12517// (select cc 0, y) [AllOnes=0]
12518// (select cc y, 0) [AllOnes=0]
12519// (zext cc) [AllOnes=0]
12520// (sext cc) [AllOnes=0/1]
12521// (select cc -1, y) [AllOnes=1]
12522// (select cc y, -1) [AllOnes=1]
12523//
12524// Invert is set when N is the null/all ones constant when CC is false.
12525// OtherOp is set to the alternative value of N.
//
// CC receives operand 0 of N (the select condition or the extended value).
// For the zext/sext forms the match additionally requires CC to be an i1
// SETCC node, and OtherOp is a freshly built constant of N's type: 0 when
// AllOnes is set, otherwise 1 for zext and all ones for sext.
// NOTE(review): the first signature line (embedded number 12526), which
// declares N and AllOnes, is missing from this listing.
12527 SDValue &CC, bool &Invert,
12528 SDValue &OtherOp,
12529 SelectionDAG &DAG) {
12530 switch (N->getOpcode()) {
12531 default: return false;
12532 case ISD::SELECT: {
12533 CC = N->getOperand(0);
12534 SDValue N1 = N->getOperand(1);
12535 SDValue N2 = N->getOperand(2);
// The constant is the true-side operand: no inversion.
12536 if (isZeroOrAllOnes(N1, AllOnes)) {
12537 Invert = false;
12538 OtherOp = N2;
12539 return true;
12540 }
// The constant is the false-side operand: report it as inverted.
12541 if (isZeroOrAllOnes(N2, AllOnes)) {
12542 Invert = true;
12543 OtherOp = N1;
12544 return true;
12545 }
12546 return false;
12547 }
12548 case ISD::ZERO_EXTEND:
12549 // (zext cc) can never be the all ones value.
12550 if (AllOnes)
12551 return false;
12552 [[fallthrough]];
12553 case ISD::SIGN_EXTEND: {
12554 SDLoc dl(N);
12555 EVT VT = N->getValueType(0);
12556 CC = N->getOperand(0);
12557 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12558 return false;
12559 Invert = !AllOnes;
12560 if (AllOnes)
12561 // When looking for an AllOnes constant, N is an sext, and the 'other'
12562 // value is 0.
12563 OtherOp = DAG.getConstant(0, dl, VT);
12564 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12565 // When looking for a 0 constant, N can be zext or sext.
12566 OtherOp = DAG.getConstant(1, dl, VT);
12567 else
12568 OtherOp = DAG.getAllOnesConstant(dl, VT);
12569 return true;
12570 }
12571 }
12572}
12573
12574 // Combine a constant select operand into its use:
12575 //
12576 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12577 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12578 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
12579 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12580 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12581 //
12582 // The transform is rejected if the select doesn't have a constant operand that
12583 // is null, or all ones when AllOnes is set.
12584 //
12585 // Also recognize sext/zext from i1:
12586 //
12587 // (add (zext cc), x) -> (select cc (add x, 1), x)
12588 // (add (sext cc), x) -> (select cc (add x, -1), x)
12589 //
12590 // These transformations eventually create predicated instructions.
12591 //
12592 // @param N The node to transform.
12593 // @param Slct The N operand that is a select.
12594 // @param OtherOp The other N operand (x above).
12595 // @param DCI Context.
12596 // @param AllOnes Require the select constant to be all ones instead of null.
12597 // @returns The new node, or SDValue() on failure.
// NOTE(review): the signature lines naming the function and its first
// parameters (original lines 12599-12600) are missing from this listing.
12598 static
12601 bool AllOnes = false) {
12602 SelectionDAG &DAG = DCI.DAG;
12603 EVT VT = N->getValueType(0);
12604 SDValue NonConstantVal;
12605 SDValue CCOp;
12606 bool SwapSelectOps;
12607 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12608 NonConstantVal, DAG))
12609 return SDValue();
12610
12611 // Slct is now known to be the desired identity constant when CC is true.
// In that case N reduces to OtherOp; otherwise N's opcode must be applied to
// OtherOp and the non-constant arm of the select.
12612 SDValue TrueVal = OtherOp;
12613 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12614 OtherOp, NonConstantVal);
12615 // Unless SwapSelectOps says CC should be false.
12616 if (SwapSelectOps)
12617 std::swap(TrueVal, FalseVal);
12618
12619 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12620 CCOp, TrueVal, FalseVal);
12621}
12622
12623 // Attempt combineSelectAndUse on each operand of a commutative operator N.
// Operand 0 is tried as the select first, then operand 1; an operand is only
// tried when its node has a single use. Returns the first successful result,
// or SDValue() if neither attempt succeeds.
// NOTE(review): the signature lines (original lines 12625-12626) are missing
// from this listing.
12624 static
12627 SDValue N0 = N->getOperand(0);
12628 SDValue N1 = N->getOperand(1);
12629 if (N0.getNode()->hasOneUse())
12630 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12631 return Result;
12632 if (N1.getNode()->hasOneUse())
12633 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12634 return Result;
12635 return SDValue();
12636}
12637
// Returns true if N is an ARMISD::VUZP node, or an ARMISD::VTRN node whose
// first result type is v2i32; false otherwise.
// NOTE(review): the signature line (original line 12638) is missing from this
// listing.
12639 // VUZP shuffle node.
12640 if (N->getOpcode() == ARMISD::VUZP)
12641 return true;
12642
12643 // "VUZP" on i32 is an alias for VTRN.
12644 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12645 return true;
12646
12647 return false;
12648}
12649
// Tries to replace an add of the two distinct results of one VUZP-style
// shuffle node with an arm_neon_vpadd intrinsic call on that shuffle's two
// operands. Returns SDValue() when N0/N1 do not have that shape or when the
// add's result type is not a 64-bit vector.
// NOTE(review): the opening signature lines (original lines 12650-12651) and
// the declaration of Ops (original line 12669) are missing from this listing.
12652 const ARMSubtarget *Subtarget) {
12653 // Look for ADD(VUZP.0, VUZP.1).
// N0 and N1 must come from the same node but must not be the same value.
12654 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12655 N0 == N1)
12656 return SDValue();
12657
12658 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12659 if (!N->getValueType(0).is64BitVector())
12660 return SDValue();
12661
12662 // Generate vpadd.
12663 SelectionDAG &DAG = DCI.DAG;
12664 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12665 SDLoc dl(N);
12666 SDNode *Unzip = N0.getNode();
12667 EVT VT = N->getValueType(0);
12668
// Operand 0 of an INTRINSIC_WO_CHAIN node is the intrinsic ID constant.
12670 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12671 TLI.getPointerTy(DAG.getDataLayout())));
12672 Ops.push_back(Unzip->getOperand(0));
12673 Ops.push_back(Unzip->getOperand(1));
12674
12675 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12676}
12677
// Tries to replace an add of two identically-extended (both sext or both zext)
// results of one VUZP-style shuffle node with an arm_neon_vpaddls /
// arm_neon_vpaddlu intrinsic call on the concatenation of the shuffle's two
// operands. Returns SDValue() when the pattern does not match.
// NOTE(review): the opening signature lines (original lines 12678-12679), the
// second half of the type check (original line 12699) and the declaration of
// Ops (original line 12708) are missing from this listing.
12680 const ARMSubtarget *Subtarget) {
12681 // Check for two extended operands.
12682 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12683 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12684 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12685 N1.getOpcode() == ISD::ZERO_EXTEND))
12686 return SDValue();
12687
12688 SDValue N00 = N0.getOperand(0);
12689 SDValue N10 = N1.getOperand(0);
12690
12691 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
// The extended values must be two different results of the same node.
12692 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12693 N00 == N10)
12694 return SDValue();
12695
12696 // We only recognize Q register paddl here; this can't be reached until
12697 // after type legalization.
12698 if (!N00.getValueType().is64BitVector() ||
12700 return SDValue();
12701
12702 // Generate vpaddl.
12703 SelectionDAG &DAG = DCI.DAG;
12704 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12705 SDLoc dl(N);
12706 EVT VT = N->getValueType(0);
12707
12709 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12710 unsigned Opcode;
12711 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12712 Opcode = Intrinsic::arm_neon_vpaddls;
12713 else
12714 Opcode = Intrinsic::arm_neon_vpaddlu;
12715 Ops.push_back(DAG.getConstant(Opcode, dl,
12716 TLI.getPointerTy(DAG.getDataLayout())));
// The intrinsic input is the shuffle's two operands concatenated: the narrow
// element type with twice as many lanes as the add's result.
12717 EVT ElemTy = N00.getValueType().getVectorElementType();
12718 unsigned NumElts = VT.getVectorNumElements();
12719 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12720 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12721 N00.getOperand(0), N00.getOperand(1));
12722 Ops.push_back(Concat);
12723
12724 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12725}
12726
12727 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12728 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12729 // much easier to match.
//
// Matches an add of two BUILD_VECTORs whose operands extract, respectively,
// the even and the odd lanes of one common input vector, and replaces it with
// an arm_neon_vpaddls intrinsic on that vector followed by an extend or
// truncate to the add's type. Returns SDValue() when the pattern is not found.
// NOTE(review): the signature lines naming the function (original lines
// 12731-12732) and several statement lines (12753, 12763-12764, 12775-12776,
// 12793, 12803) are missing from this listing.
12730 static SDValue
12733 const ARMSubtarget *Subtarget) {
12734 // Only perform optimization if after legalize, and if NEON is available. We
12735 // also expected both operands to be BUILD_VECTORs.
12736 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12737 || N0.getOpcode() != ISD::BUILD_VECTOR
12738 || N1.getOpcode() != ISD::BUILD_VECTOR)
12739 return SDValue();
12740
12741 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12742 EVT VT = N->getValueType(0);
12743 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12744 return SDValue();
12745
12746 // Check that the vector operands are of the right form.
12747 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12748 // operands, where N is the size of the formed vector.
12749 // Each EXTRACT_VECTOR should have the same input vector and odd or even
12750 // index such that we have a pair wise add pattern.
12751
12752 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12754 return SDValue();
12755 SDValue Vec = N0->getOperand(0)->getOperand(0);
12756 SDNode *V = Vec.getNode();
12757 unsigned nextIndex = 0;
12758
12759 // For each operand of the BUILD_VECTORs feeding the ADD, check that it is
12760 // an EXTRACT_VECTOR with
12761 // the same vector and appropriate index.
12762 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12765
12766 SDValue ExtVec0 = N0->getOperand(i);
12767 SDValue ExtVec1 = N1->getOperand(i);
12768
12769 // First operand is the vector, verify it's the same.
12770 if (V != ExtVec0->getOperand(0).getNode() ||
12771 V != ExtVec1->getOperand(0).getNode())
12772 return SDValue();
12773
12774 // Second is the constant, verify it's correct.
12777
12778 // For the constant, we want to see all the even or all the odd.
// N0 must supply lane nextIndex and N1 lane nextIndex + 1.
12779 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12780 || C1->getZExtValue() != nextIndex+1)
12781 return SDValue();
12782
12783 // Increment index.
12784 nextIndex+=2;
12785 } else
12786 return SDValue();
12787 }
12788
12789 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12790 // we're using the entire input vector, otherwise there's a size/legality
12791 // mismatch somewhere.
12792 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12794 return SDValue();
12795
12796 // Create VPADDL node.
12797 SelectionDAG &DAG = DCI.DAG;
12798 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12799
12800 SDLoc dl(N);
12801
12802 // Build operand list.
// The signed intrinsic variant is used unconditionally here.
12804 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12805 TLI.getPointerTy(DAG.getDataLayout())));
12806
12807 // Input is the vector.
12808 Ops.push_back(Vec);
12809
12810 // Get widened type and narrowed type.
12811 MVT widenType;
12812 unsigned numElem = VT.getVectorNumElements();
12813
// The intrinsic result has lanes twice as wide as the input lanes.
12814 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12815 switch (inputLaneType.getSimpleVT().SimpleTy) {
12816 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12817 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12818 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12819 default:
12820 llvm_unreachable("Invalid vector element type for padd optimization.");
12821 }
12822
// Convert the widened result to VT: any-extend if VT is wider, else truncate.
12823 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12824 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12825 return DAG.getNode(ExtOp, dl, VT, tmp);
12826}
12827
// Returns V if it is an ISD::UMUL_LOHI or ISD::SMUL_LOHI node, otherwise an
// empty SDValue.
// NOTE(review): the signature line (original line 12828) is missing from this
// listing.
12829 if (V->getOpcode() == ISD::UMUL_LOHI ||
12830 V->getOpcode() == ISD::SMUL_LOHI)
12831 return V;
12832 return SDValue();
12833}
12834
// Tries to fold an ADDC/ADDE pair that accumulates a 32-bit MUL of two 16-bit
// values into one ARMISD::SMLALBB/BT/TB/TT node with two i32 results. On
// success the uses of both add nodes are replaced in place and the original
// ADDC value is returned; otherwise SDValue() is returned. Requires
// Subtarget->hasBaseDSP().
// NOTE(review): the middle signature line (original line 12836, presumably
// the DCI parameter used below) is missing from this listing.
12835 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12837 const ARMSubtarget *Subtarget) {
12838 if (!Subtarget->hasBaseDSP())
12839 return SDValue();
12840
12841 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12842 // accumulates the product into a 64-bit value. The 16-bit values will
12843 // be sign extended somehow or SRA'd into 32-bit values
12844 // (addc (adde (mul 16bit, 16bit), lo), hi)
// The MUL may be either operand of the ADDC; the other operand is Lo.
12845 SDValue Mul = AddcNode->getOperand(0);
12846 SDValue Lo = AddcNode->getOperand(1);
12847 if (Mul.getOpcode() != ISD::MUL) {
12848 Lo = AddcNode->getOperand(0);
12849 Mul = AddcNode->getOperand(1);
12850 if (Mul.getOpcode() != ISD::MUL)
12851 return SDValue();
12852 }
12853
// Likewise one ADDE operand must be an SRA; the other operand is Hi.
12854 SDValue SRA = AddeNode->getOperand(0);
12855 SDValue Hi = AddeNode->getOperand(1);
12856 if (SRA.getOpcode() != ISD::SRA) {
12857 SRA = AddeNode->getOperand(1);
12858 Hi = AddeNode->getOperand(0);
12859 if (SRA.getOpcode() != ISD::SRA)
12860 return SDValue();
12861 }
// The SRA must shift by the constant 31 ...
12862 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12863 if (Const->getZExtValue() != 31)
12864 return SDValue();
12865 } else
12866 return SDValue();
12867
// ... and must shift the very same MUL that feeds the ADDC.
12868 if (SRA.getOperand(0) != Mul)
12869 return SDValue();
12870
12871 SelectionDAG &DAG = DCI.DAG;
12872 SDLoc dl(AddcNode);
12873 unsigned Opcode = 0;
12874 SDValue Op0;
12875 SDValue Op1;
12876
// Pick the opcode from the shape of each MUL operand as classified by isS16
// and isSRA16; for an isSRA16 operand the shifted source value is used.
12877 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12878 Opcode = ARMISD::SMLALBB;
12879 Op0 = Mul.getOperand(0);
12880 Op1 = Mul.getOperand(1);
12881 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12882 Opcode = ARMISD::SMLALBT;
12883 Op0 = Mul.getOperand(0);
12884 Op1 = Mul.getOperand(1).getOperand(0);
12885 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12886 Opcode = ARMISD::SMLALTB;
12887 Op0 = Mul.getOperand(0).getOperand(0);
12888 Op1 = Mul.getOperand(1);
12889 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12890 Opcode = ARMISD::SMLALTT;
12891 Op0 = Mul->getOperand(0).getOperand(0);
12892 Op1 = Mul->getOperand(1).getOperand(0);
12893 }
12894
// Op0/Op1 are still empty if none of the four shapes matched.
12895 if (!Op0 || !Op1)
12896 return SDValue();
12897
12898 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12899 Op0, Op1, Lo, Hi);
12900 // Replace the ADDs' nodes uses by the MLA node's values.
12901 SDValue HiMLALResult(SMLAL.getNode(), 1);
12902 SDValue LoMLALResult(SMLAL.getNode(), 0);
12903
12904 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12905 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12906
12907 // Return original node to notify the driver to stop replacing.
12908 SDValue resNode(AddcNode, 0);
12909 return resNode;
12910}
12911
// NOTE(review): the opening signature lines (original lines 12912-12913) and
// the declaration of Ops (original line 13032) are missing from this listing.
12914 const ARMSubtarget *Subtarget) {
12915 // Look for multiply add opportunities.
12916 // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
12917 // each add node consumes a value from ISD::UMUL_LOHI and there is
12918 // a glue link from the first add to the second add.
12919 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12920 // a S/UMLAL instruction.
12921 // UMUL_LOHI
12922 // / :lo \ :hi
12923 // V \ [no multiline comment]
12924 // loAdd -> ADDC |
12925 // \ :carry /
12926 // V V
12927 // ADDE <- hiAdd
12928 //
12929 // In the special case where only the higher part of a signed result is used
12930 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12931 // a constant with the exact value of 0x80000000, we recognize we are dealing
12932 // with a "rounded multiply and add" (or subtract) and transform it into
12933 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
12934
12935 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12936 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12937 "Expect an ADDE or SUBE");
12938
12939 assert(AddeSubeNode->getNumOperands() == 3 &&
12940 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12941 "ADDE node has the wrong inputs");
12942
12943 // Check that we are chained to the right ADDC or SUBC node.
// ADDE must be fed by ADDC and SUBE by SUBC (via operand 2).
12944 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12945 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12946 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12947 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12948 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12949 return SDValue();
12950
12951 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12952 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12953
12954 // Check if the two operands are from the same mul_lohi node.
12955 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12956 return SDValue();
12957
12958 assert(AddcSubcNode->getNumValues() == 2 &&
12959 AddcSubcNode->getValueType(0) == MVT::i32 &&
12960 "Expect ADDC with two result values. First: i32");
12961
12962 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12963 // maybe a SMLAL which multiplies two 16-bit values.
12964 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12965 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12966 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12967 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12968 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12969 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12970
12971 // Check for the triangle shape.
12972 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12973 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12974
12975 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12976 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12977 return SDValue();
12978
12979 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
// Operand 0 is preferred; IsLeftOperandMUL records which side matched.
12980 bool IsLeftOperandMUL = false;
12981 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12982 if (MULOp == SDValue())
12983 MULOp = findMUL_LOHI(AddeSubeOp1);
12984 else
12985 IsLeftOperandMUL = true;
12986 if (MULOp == SDValue())
12987 return SDValue();
12988
12989 // Figure out the right opcode.
12990 unsigned Opc = MULOp->getOpcode();
12991 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12992
12993 // Figure out the high and low input values to the MLAL node.
12994 SDValue *HiAddSub = nullptr;
12995 SDValue *LoMul = nullptr;
12996 SDValue *LowAddSub = nullptr;
12997
12998 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12999 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
13000 return SDValue();
13001
// The high addend is whichever ADDE/SUBE operand is not the multiply.
13002 if (IsLeftOperandMUL)
13003 HiAddSub = &AddeSubeOp1;
13004 else
13005 HiAddSub = &AddeSubeOp0;
13006
13007 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
13008 // whose low result is fed to the ADDC/SUBC we are checking.
13009
13010 if (AddcSubcOp0 == MULOp.getValue(0)) {
13011 LoMul = &AddcSubcOp0;
13012 LowAddSub = &AddcSubcOp1;
13013 }
13014 if (AddcSubcOp1 == MULOp.getValue(0)) {
13015 LoMul = &AddcSubcOp1;
13016 LowAddSub = &AddcSubcOp0;
13017 }
13018
// Neither ADDC/SUBC operand is the low result of the multiply.
13019 if (!LoMul)
13020 return SDValue();
13021
13022 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
13023 // the replacement below will create a cycle.
13024 if (AddcSubcNode == HiAddSub->getNode() ||
13025 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
13026 return SDValue();
13027
13028 // Create the merged node.
13029 SelectionDAG &DAG = DCI.DAG;
13030
13031 // Start building operand list.
// LoMul refers to the MUL_LOHI node, so these are the two multiplicands.
13033 Ops.push_back(LoMul->getOperand(0));
13034 Ops.push_back(LoMul->getOperand(1));
13035
13036 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
13037 // the case, we must be doing signed multiplication and only use the higher
13038 // part of the result of the MLAL, furthermore the LowAddSub must be a constant
13039 // addition or subtraction with the value of 0x80000000.
13040 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
13041 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
13042 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
13043 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
13044 0x80000000) {
13045 Ops.push_back(*HiAddSub);
13046 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
13047 FinalOpc = ARMISD::SMMLSR;
13048 } else {
13049 FinalOpc = ARMISD::SMMLAR;
13050 }
// The rounded forms produce a single i32 that replaces only the ADDE/SUBE value.
13051 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
13052 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
13053
13054 return SDValue(AddeSubeNode, 0);
13055 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
13056 // SMMLS is generated during instruction selection and the rest of this
13057 // function can not handle the case where AddcSubcNode is a SUBC.
13058 return SDValue();
13059
13060 // Finish building the operand list for {U/S}MLAL
13061 Ops.push_back(*LowAddSub);
13062 Ops.push_back(*HiAddSub);
13063
13064 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
13065 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13066
13067 // Replace the ADDs' nodes uses by the MLA node's values.
13068 SDValue HiMLALResult(MLALNode.getNode(), 1);
13069 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
13070
13071 SDValue LoMLALResult(MLALNode.getNode(), 0);
13072 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
13073
13074 // Return original node to notify the driver to stop replacing.
13075 return SDValue(AddeSubeNode, 0);
13076}
13077
// NOTE(review): the opening signature lines (original lines 13078-13079) are
// missing from this listing.
13080 const ARMSubtarget *Subtarget) {
13081 // UMAAL is similar to UMLAL except that it adds two unsigned values.
13082 // While trying to combine for the other MLAL nodes, first search for the
13083 // chance to use UMAAL. Check if Addc uses a node which has already
13084 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
13085 // as the addend, and it's handled in PerformUMLALCombine.
13086
// Without the required subtarget features, fall back to the generic MLAL
// combine.
13087 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13088 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13089
13090 // Check that we have a glued ADDC node.
13091 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
13092 if (AddcNode->getOpcode() != ARMISD::ADDC)
13093 return SDValue();
13094
13095 // Find the already-converted UMLAL, or fall back to the generic MLAL combine
// if neither ADDC operand is one. AddHi is the other ADDC operand.
13096 SDNode *UmlalNode = nullptr;
13097 SDValue AddHi;
13098 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
13099 UmlalNode = AddcNode->getOperand(0).getNode();
13100 AddHi = AddcNode->getOperand(1);
13101 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
13102 UmlalNode = AddcNode->getOperand(1).getNode();
13103 AddHi = AddcNode->getOperand(0);
13104 } else {
13105 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13106 }
13107
13108 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
13109 // the ADDC as well as Zero.
// Operand 3 of the UMLAL must itself be the constant zero.
13110 if (!isNullConstant(UmlalNode->getOperand(3)))
13111 return SDValue();
13112
// The ADDE must add zero and the UMLAL node, in either operand order.
13113 if ((isNullConstant(AddeNode->getOperand(0)) &&
13114 AddeNode->getOperand(1).getNode() == UmlalNode) ||
13115 (AddeNode->getOperand(0).getNode() == UmlalNode &&
13116 isNullConstant(AddeNode->getOperand(1)))) {
13117 SelectionDAG &DAG = DCI.DAG;
// UMAAL takes the UMLAL's first three operands plus AddHi as fourth operand.
13118 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
13119 UmlalNode->getOperand(2), AddHi };
13120 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
13121 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13122
13123 // Replace the ADDs' nodes uses by the UMAAL node's values.
13124 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
13125 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
13126
13127 // Return original node to notify the driver to stop replacing.
13128 return SDValue(AddeNode, 0);
13129 }
13130 return SDValue();
13131}
13132
// Rewrites a node N whose operands 2 and 3 are an ADDC and an ADDE of two
// zeros fed by that same ADDC into an ARMISD::UMAAL taking N's operands 0 and
// 1 plus the ADDC's two operands. Returns SDValue() when the subtarget lacks
// V6 ops or DSP, or when the operands do not have that shape.
// NOTE(review): the opening signature line (original line 13133) is missing
// from this listing.
13134 const ARMSubtarget *Subtarget) {
13135 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13136 return SDValue();
13137
13138 // Check that we have a pair of ADDC and ADDE as operands.
13139 // Both addends of the ADDE must be zero.
13140 SDNode* AddcNode = N->getOperand(2).getNode();
13141 SDNode* AddeNode = N->getOperand(3).getNode();
13142 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
13143 (AddeNode->getOpcode() == ARMISD::ADDE) &&
13144 isNullConstant(AddeNode->getOperand(0)) &&
13145 isNullConstant(AddeNode->getOperand(1)) &&
13146 (AddeNode->getOperand(2).getNode() == AddcNode))
13147 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
13148 DAG.getVTList(MVT::i32, MVT::i32),
13149 {N->getOperand(0), N->getOperand(1),
13150 AddcNode->getOperand(0), AddcNode->getOperand(1)});
13151 else
13152 return SDValue();
13153}
13154
// Combines for ARMISD::ADDC / ARMISD::SUBC nodes:
//  * (SUBC (ADDE 0, 0, C), 1), when its second result is used, is replaced so
//    that the second result becomes C directly.
//  * On Thumb1-only subtargets, a negative constant RHS is negated and the
//    opcode flipped between ADDC and SUBC.
// NOTE(review): the opening signature lines (original lines 13155-13156) and
// the line that declares C (original line 13173, presumably a ConstantSDNode
// cast of RHS) are missing from this listing.
13157 const ARMSubtarget *Subtarget) {
13158 SelectionDAG &DAG(DCI.DAG);
13159
13160 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
13161 // (SUBC (ADDE 0, 0, C), 1) -> C
13162 SDValue LHS = N->getOperand(0);
13163 SDValue RHS = N->getOperand(1);
13164 if (LHS->getOpcode() == ARMISD::ADDE &&
13165 isNullConstant(LHS->getOperand(0)) &&
13166 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
// Result 0 is kept as is; result 1 is replaced by the ADDE's operand 2.
13167 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
13168 }
13169 }
13170
13171 if (Subtarget->isThumb1Only()) {
13172 SDValue RHS = N->getOperand(1);
13174 int32_t imm = C->getSExtValue();
// The minimum int value is excluded because -imm below would overflow.
13175 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
13176 SDLoc DL(N);
13177 RHS = DAG.getConstant(-imm, DL, MVT::i32);
13178 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
13179 : ARMISD::ADDC;
13180 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
13181 }
13182 }
13183 }
13184
13185 return SDValue();
13186}
13187
// Combines for ARMISD::ADDE / ARMISD::SUBE nodes:
//  * On Thumb1-only subtargets, a negative constant RHS is replaced by its
//    bitwise complement and the opcode flipped between ADDE and SUBE.
//  * Otherwise, when operand 1 is an ISD::SMUL_LOHI, the node is handed to
//    AddCombineTo64bitMLAL.
// NOTE(review): the opening signature lines (original lines 13188-13189) and
// the line that declares C (original line 13194, presumably a ConstantSDNode
// cast of RHS) are missing from this listing.
13190 const ARMSubtarget *Subtarget) {
13191 if (Subtarget->isThumb1Only()) {
13192 SelectionDAG &DAG = DCI.DAG;
13193 SDValue RHS = N->getOperand(1);
13195 int64_t imm = C->getSExtValue();
13196 if (imm < 0) {
13197 SDLoc DL(N);
13198
13199 // The with-carry-in form matches bitwise not instead of the negation.
13200 // Effectively, the inverse interpretation of the carry flag already
13201 // accounts for part of the negation.
13202 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13203
13204 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13205 : ARMISD::ADDE;
// Operand 2 of N is passed through unchanged to the new node.
13206 return DAG.getNode(Opcode, DL, N->getVTList(),
13207 N->getOperand(0), RHS, N->getOperand(2));
13208 }
13209 }
13210 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
13211 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
13212 }
13213 return SDValue();
13214}
13215
// MVE-only combine: folds a SELECT (with a SETCC condition) or SELECT_CC that
// chooses between a scalar and a VECREDUCE_{U,S}{MIN,MAX} of a v16i8, v8i16 or
// v4i32 vector, using the matching comparison, into a single
// ARMISD::VMINVu/VMINVs/VMAXVu/VMAXVs node. Returns SDValue() when the
// pattern does not match.
// NOTE(review): the opening signature lines (original lines 13216-13217) and
// the declaration of CC (original line 13226) are missing from this listing.
13218 const ARMSubtarget *Subtarget) {
13219 if (!Subtarget->hasMVEIntegerOps())
13220 return SDValue();
13221
13222 SDLoc dl(N);
13223 SDValue SetCC;
13224 SDValue LHS;
13225 SDValue RHS;
13227 SDValue TrueVal;
13228 SDValue FalseVal;
13229
// Extract the compared values, condition code and selected values from either
// node form.
13230 if (N->getOpcode() == ISD::SELECT &&
13231 N->getOperand(0)->getOpcode() == ISD::SETCC) {
13232 SetCC = N->getOperand(0);
13233 LHS = SetCC->getOperand(0);
13234 RHS = SetCC->getOperand(1);
13235 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
13236 TrueVal = N->getOperand(1);
13237 FalseVal = N->getOperand(2);
13238 } else if (N->getOpcode() == ISD::SELECT_CC) {
13239 LHS = N->getOperand(0);
13240 RHS = N->getOperand(1);
13241 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
13242 TrueVal = N->getOperand(2);
13243 FalseVal = N->getOperand(3);
13244 } else {
13245 return SDValue();
13246 }
13247
// Choose the reduction opcode from the kind of VECREDUCE present and the
// condition code. For the reversed comparison the selected values are swapped.
13248 unsigned int Opcode = 0;
13249 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13250 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13251 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13252 Opcode = ARMISD::VMINVu;
13253 if (CC == ISD::SETUGT)
13254 std::swap(TrueVal, FalseVal);
13255 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13256 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13257 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13258 Opcode = ARMISD::VMINVs;
13259 if (CC == ISD::SETGT)
13260 std::swap(TrueVal, FalseVal);
13261 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13262 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13263 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13264 Opcode = ARMISD::VMAXVu;
13265 if (CC == ISD::SETULT)
13266 std::swap(TrueVal, FalseVal);
13267 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13268 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13269 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13270 Opcode = ARMISD::VMAXVs;
13271 if (CC == ISD::SETLT)
13272 std::swap(TrueVal, FalseVal);
13273 } else
13274 return SDValue();
13275
13276 // Normalise to the right hand side being the vector reduction
13277 switch (TrueVal->getOpcode()) {
13278 case ISD::VECREDUCE_UMIN:
13279 case ISD::VECREDUCE_SMIN:
13280 case ISD::VECREDUCE_UMAX:
13281 case ISD::VECREDUCE_SMAX:
13282 std::swap(LHS, RHS);
13283 std::swap(TrueVal, FalseVal);
13284 break;
13285 }
13286
// FalseVal is now the reduction; its operand 0 is the vector being reduced.
13287 EVT VectorType = FalseVal->getOperand(0).getValueType();
13288
13289 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13290 VectorType != MVT::v4i32)
13291 return SDValue();
13292
13293 EVT VectorScalarType = VectorType.getVectorElementType();
13294
13295 // The values being selected must also be the ones being compared
13296 if (TrueVal != LHS || FalseVal != RHS)
13297 return SDValue();
13298
13299 EVT LeftType = LHS->getValueType(0);
13300 EVT RightType = RHS->getValueType(0);
13301
13302 // The types must match the reduced type too
13303 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13304 return SDValue();
13305
13306 // Legalise the scalar to an i32
13307 if (VectorScalarType != MVT::i32)
13308 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13309
13310 // Generate the reduction as an i32 for legalisation purposes
// RHS equals FalseVal here, so RHS->getOperand(0) is the vector validated above.
13311 auto Reduction =
13312 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13313
13314 // The result isn't actually an i32 so truncate it back to its original type
13315 if (VectorScalarType != MVT::i32)
13316 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13317
13318 return Reduction;
13319}
13320
13321 // A special combine for the vqdmulh family of instructions. This is one of the
13322 // potential set of patterns that could match this instruction. The base pattern
13323 // you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13324 // This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13325 // which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as
13326 // the max is unnecessary.
// NOTE(review): the signature line (original line 13327) and three statement
// lines (13375 declaring N1, 13403 inside the ExtVecVT initializer, 13420
// declaring Parts) are missing from this listing.
13328 EVT VT = N->getValueType(0);
13329 SDValue Shft;
13330 ConstantSDNode *Clamp;
13331
13332 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13333 return SDValue();
13334
// Find the clamp constant and the clamped value from either an SMIN or the
// equivalent vselect(setcc(a, b, setlt), a, b).
13335 if (N->getOpcode() == ISD::SMIN) {
13336 Shft = N->getOperand(0);
13337 Clamp = isConstOrConstSplat(N->getOperand(1));
13338 } else if (N->getOpcode() == ISD::VSELECT) {
13339 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13340 SDValue Cmp = N->getOperand(0);
13341 if (Cmp.getOpcode() != ISD::SETCC ||
13342 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13343 Cmp.getOperand(0) != N->getOperand(1) ||
13344 Cmp.getOperand(1) != N->getOperand(2))
13345 return SDValue();
13346 Shft = N->getOperand(1);
13347 Clamp = isConstOrConstSplat(N->getOperand(2));
13348 } else
13349 return SDValue();
13350
13351 if (!Clamp)
13352 return SDValue();
13353
// The clamp value determines the narrow element type and the shift amount
// (element width minus one) that the pattern must use.
13354 MVT ScalarType;
13355 int ShftAmt = 0;
13356 switch (Clamp->getSExtValue()) {
13357 case (1 << 7) - 1:
13358 ScalarType = MVT::i8;
13359 ShftAmt = 7;
13360 break;
13361 case (1 << 15) - 1:
13362 ScalarType = MVT::i16;
13363 ShftAmt = 15;
13364 break;
13365 case (1ULL << 31) - 1:
13366 ScalarType = MVT::i32;
13367 ShftAmt = 31;
13368 break;
13369 default:
13370 return SDValue();
13371 }
13372
13373 if (Shft.getOpcode() != ISD::SRA)
13374 return SDValue();
13376 if (!N1 || N1->getSExtValue() != ShftAmt)
13377 return SDValue();
13378
13379 SDValue Mul = Shft.getOperand(0);
13380 if (Mul.getOpcode() != ISD::MUL)
13381 return SDValue();
13382
// Both multiplicands must be sign extensions from the same vector type with
// ScalarType elements, and VT must be at least twice as wide per element.
13383 SDValue Ext0 = Mul.getOperand(0);
13384 SDValue Ext1 = Mul.getOperand(1);
13385 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13386 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13387 return SDValue();
13388 EVT VecVT = Ext0.getOperand(0).getValueType();
13389 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13390 return SDValue();
13391 if (Ext1.getOperand(0).getValueType() != VecVT ||
13392 VecVT.getScalarType() != ScalarType ||
13393 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13394 return SDValue();
13395
13396 SDLoc DL(Mul);
// ShftAmt + 1 is the element width in bits, so this is the lane count of a
// 128-bit vector of ScalarType.
13397 unsigned LegalLanes = 128 / (ShftAmt + 1);
13398 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13399 // For types smaller than legal vectors extend to be legal and only use needed
13400 // lanes.
13401 if (VecVT.getSizeInBits() < 128) {
13402 EVT ExtVecVT =
13404 VecVT.getVectorNumElements());
13405 SDValue Inp0 =
13406 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13407 SDValue Inp1 =
13408 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13409 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13410 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13411 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
// Cast back to the extended type, narrow to VecVT, then sign extend to VT.
13412 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13413 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13414 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13415 }
13416
13417 // For larger types, split into legal sized chunks.
13418 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13419 unsigned NumParts = VecVT.getSizeInBits() / 128;
13421 for (unsigned I = 0; I < NumParts; ++I) {
13422 SDValue Inp0 =
13423 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13424 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13425 SDValue Inp1 =
13426 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13427 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13428 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13429 Parts.push_back(VQDMULH);
13430 }
// Reassemble the 128-bit pieces into VecVT and sign extend to the result type.
13431 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13432 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13433}
13434
// MVE-only combine for VSELECT nodes. First tries PerformVQDMULHCombine; if
// that yields nothing, rewrites vselect(xor(cond, 1), lhs, rhs) into
// vselect(cond, rhs, lhs). Returns SDValue() when neither applies.
// NOTE(review): the opening signature lines (original lines 13435-13436) are
// missing from this listing.
13437 const ARMSubtarget *Subtarget) {
13438 if (!Subtarget->hasMVEIntegerOps())
13439 return SDValue();
13440
13441 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13442 return V;
13443
13444 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13445 //
13446 // We need to re-implement this optimization here as the implementation in the
13447 // Target-Independent DAGCombiner does not handle the kind of constant we make
13448 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13449 // good reason, allowing truncation there would break other targets).
13450 //
13451 // Currently, this is only done for MVE, as it's the only target that benefits
13452 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13453 if (N->getOperand(0).getOpcode() != ISD::XOR)
13454 return SDValue();
13455 SDValue XOR = N->getOperand(0);
13456
13457 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13458 // It is important to check with truncation allowed as the BUILD_VECTORs we
13459 // generate in those situations will truncate their operands.
13460 ConstantSDNode *Const =
13461 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13462 /*AllowTruncation*/ true);
13463 if (!Const || !Const->isOne())
13464 return SDValue();
13465
13466 // Rewrite into vselect(cond, rhs, lhs).
13467 SDValue Cond = XOR->getOperand(0);
13468 SDValue LHS = N->getOperand(1);
13469 SDValue RHS = N->getOperand(2);
13470 EVT Type = N->getValueType(0);
13471 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13472}
13473
13474// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
//
// N is a setcc-style node: operands 0 and 1 are compared and operand 2 holds
// the condition code. On success the result is an INTRINSIC_WO_CHAIN node
// calling the arm_mve_vctp{8,16,32,64} intrinsic picked from the result's
// element count; otherwise an empty SDValue is returned.
// NOTE(review): this listing has dropped several lines of the definition
// (13475-13476: name and leading parameters; 13484, 13493 and 13499: parts of
// the conditions below), so the guards shown here are incomplete.
13477 const ARMSubtarget *Subtarget) {
13478 SDValue Op0 = N->getOperand(0);
13479 SDValue Op1 = N->getOperand(1);
13480 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13481 EVT VT = N->getValueType(0);
13482
13483 if (!Subtarget->hasMVEIntegerOps() ||
13485 return SDValue();
13486
// Canonicalise "a uge b" to "b ult a" so only SETULT needs matching below.
13487 if (CC == ISD::SETUGE) {
13488 std::swap(Op0, Op1);
13489 CC = ISD::SETULT;
13490 }
13491
// The result must be a vector of i1 (a predicate-style value).
13492 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13494 return SDValue();
13495
13496 // Check first operand is BuildVector of 0,1,2,...
// Undef lanes are accepted in place of the expected lane index.
13497 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13498 if (!Op0.getOperand(I).isUndef() &&
13500 Op0.getConstantOperandVal(I) == I))
13501 return SDValue();
13502 }
13503
13504 // The second is a Splat of Op1S
13505 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13506 if (!Op1S)
13507 return SDValue();
13508
// Pick the vctp intrinsic from the lane count: 2 -> vctp64, 4 -> vctp32,
// 8 -> vctp16, 16 -> vctp8. Any other lane count is not handled.
13509 unsigned Opc;
13510 switch (VT.getVectorNumElements()) {
13511 case 2:
13512 Opc = Intrinsic::arm_mve_vctp64;
13513 break;
13514 case 4:
13515 Opc = Intrinsic::arm_mve_vctp32;
13516 break;
13517 case 8:
13518 Opc = Intrinsic::arm_mve_vctp16;
13519 break;
13520 case 16:
13521 Opc = Intrinsic::arm_mve_vctp8;
13522 break;
13523 default:
13524 return SDValue();
13525 }
13526
// The splatted scalar is zero-extended or truncated to i32 for the intrinsic.
13527 SDLoc DL(N);
13528 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13529 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13530 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13531}
13532
13533/// PerformADDECombine - Target-specific dag combine transform from
13534/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13535/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
///
/// Thumb1-only subtargets are routed to PerformAddeSubeCombine instead. For
/// other subtargets nothing is done before legalization; afterwards the work
/// is delegated to AddCombineTo64bitUMAAL.
// NOTE(review): listing lines 13536-13537 (function name and leading
// parameters) are missing here.
13538 const ARMSubtarget *Subtarget) {
13539 // Only ARM and Thumb2 support UMLAL/SMLAL.
13540 if (Subtarget->isThumb1Only())
13541 return PerformAddeSubeCombine(N, DCI, Subtarget);
13542
13543 // Only perform the checks after legalize when the pattern is available.
13544 if (DCI.isBeforeLegalize()) return SDValue();
13545
13546 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13547}
13548
13549/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13550/// operands N0 and N1. This is a helper for PerformADDCombine that is
13551/// called with the default operands, and if that fails, with commuted
13552/// operands.
///
/// The attempts are made in order (VPADD, two VPADDL forms, then the
/// select fold) and the first non-empty result is returned; an empty SDValue
/// means no combine matched for this operand order.
// NOTE(review): listing lines 13553-13554 (function name and leading
// parameters) are missing here.
13555 const ARMSubtarget *Subtarget){
13556 // Attempt to create vpadd for this add.
13557 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13558 return Result;
13559
13560 // Attempt to create vpaddl for this add.
13561 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13562 return Result;
13563 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13564 Subtarget))
13565 return Result;
13566
13567 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
// Only attempted when N0 has a single use.
13568 if (N0.getNode()->hasOneUse())
13569 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13570 return Result;
13571 return SDValue();
13572}
13573
// Reassociates an add whose operands involve vector reductions
// (VECREDUCE_ADD / VADDV / VMLAV). Two families of rewrites are tried, each
// with both operand orders:
//   1. DistrubuteAddAddVecReduce: regroup i32 adds so each reduction is added
//      to an accumulating scalar one at a time.
//   2. DistrubuteVecReduceLoad: order reductions of loads from the same base
//      by their load offsets.
// Returns the rewritten value, or an empty SDValue if nothing matched.
// NOTE(review): the signature line (listing line 13574) is missing here; the
// body uses `N` and `DAG`. Presumably this is
// TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) -- confirm.
13575 EVT VT = N->getValueType(0);
13576 SDValue N0 = N->getOperand(0);
13577 SDValue N1 = N->getOperand(1);
13578 SDLoc dl(N);
13579
// True for the reduction opcodes this combine cares about.
13580 auto IsVecReduce = [](SDValue Op) {
13581 switch (Op.getOpcode()) {
13582 case ISD::VECREDUCE_ADD:
13583 case ARMISD::VADDVs:
13584 case ARMISD::VADDVu:
13585 case ARMISD::VMLAVs:
13586 case ARMISD::VMLAVu:
13587 return true;
13588 }
13589 return false;
13590 };
13591
13592 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13593 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13594 // add(add(X, vecreduce(Y)), vecreduce(Z))
13595 // to make better use of vaddva style instructions.
13596 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13597 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13598 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13599 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13600 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13601 }
13602 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13603 // add(add(add(A, C), reduce(B)), reduce(D))
13604 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13605 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
// Find which operand of each inner add is the reduction; bail out if
// neither operand is one.
13606 unsigned N0RedOp = 0;
13607 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13608 N0RedOp = 1;
13609 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13610 return SDValue();
13611 }
13612
13613 unsigned N1RedOp = 0;
13614 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13615 N1RedOp = 1;
13616 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13617 return SDValue();
13618
// `1 - RedOp` indexes the non-reduction operand of each inner add.
13619 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13620 N1.getOperand(1 - N1RedOp));
13621 SDValue Add1 =
13622 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13623 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13624 }
13625 return SDValue();
13626 };
13627 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13628 return R;
13629 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13630 return R;
13631
13632 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13633 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13634 // by ascending load offsets. This can help cores prefetch if the order of
13635 // loads is more predictable.
13636 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13637 // Check if two reductions are known to load data where one is before/after
13638 // another. Return negative if N0 loads data before N1, positive if N1 is
13639 // before N0 and 0 otherwise if nothing is known.
13640 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13641 // Look through to the first operand of a MUL, for the VMLA case.
13642 // Currently only looks at the first operand, in the hope they are equal.
13643 if (N0.getOpcode() == ISD::MUL)
13644 N0 = N0.getOperand(0);
13645 if (N1.getOpcode() == ISD::MUL)
13646 N1 = N1.getOperand(0);
13647
13648 // Both operands must be simple, unindexed loads hanging off the same
13649 // chain; otherwise nothing is known about their order (return 0).
13650 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13651 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13652 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13653 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13654 Load1->isIndexed())
13655 return 0;
13656
13657 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13658 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13659
// The two addresses must share a base and both have valid offsets before
// the offsets can be compared.
13660 if (!BaseLocDecomp0.getBase() ||
13661 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13662 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13663 return 0;
13664 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13665 return -1;
13666 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13667 return 1;
13668 return 0;
13669 };
13670
// Split a single-use add N0 into (X, reduction). When both of its operands
// are reductions, X is the one that loads from the lower offset.
13671 SDValue X;
13672 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13673 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13674 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13675 N0.getOperand(1).getOperand(0));
13676 if (IsBefore < 0) {
13677 X = N0.getOperand(0);
13678 N0 = N0.getOperand(1);
13679 } else if (IsBefore > 0) {
13680 X = N0.getOperand(1);
13681 N0 = N0.getOperand(0);
13682 } else
13683 return SDValue();
13684 } else if (IsVecReduce(N0.getOperand(0))) {
13685 X = N0.getOperand(1);
13686 N0 = N0.getOperand(0);
13687 } else if (IsVecReduce(N0.getOperand(1))) {
13688 X = N0.getOperand(0);
13689 N0 = N0.getOperand(1);
13690 } else
13691 return SDValue();
13692 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13693 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13694 // Note this is backward to how you would expect. We create
13695 // add(reduce(load + 16), reduce(load + 0)) so that the
13696 // add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving
13697 // the X as VADDV(load + 0)
13698 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13699 } else
13700 return SDValue();
13701
13702 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13703 return SDValue();
13704
// Only swap when N1's load is known to come strictly before N0's.
13705 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13706 return SDValue();
13707
13708 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13709 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13710 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13711 };
// IsForward is set only for the original operand order, so the plain
// two-reduction swap above is attempted once.
13712 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13713 return R;
13714 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13715 return R;
13716 return SDValue();
13717}
13718
// MVE-only combine that folds an i64 add into a long vector reduction. An
// operand of the form build_pair(VecRed, VecRed:1) is matched, where VecRed is
// one of the VADDLV*/VMLALV* nodes or its accumulating VADDLVA*/VMLALVA* form,
// and the other add operand becomes the accumulator input of the accumulating
// node. Returns an empty SDValue when no pattern matches or the type is not
// i64.
// NOTE(review): this listing has dropped listing line 13719 (function name
// and leading parameters), 13724 (the condition guarding the first
// `return R;`) and 13760 (the declaration of `Ops`).
13720 const ARMSubtarget *Subtarget) {
13721 if (!Subtarget->hasMVEIntegerOps())
13722 return SDValue();
13723
13725 return R;
13726
13727 EVT VT = N->getValueType(0);
13728 SDValue N0 = N->getOperand(0);
13729 SDValue N1 = N->getOperand(1);
13730 SDLoc dl(N);
13731
13732 if (VT != MVT::i64)
13733 return SDValue();
13734
13735 // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
13736 // will look like:
13737 // t1: i32,i32 = ARMISD::VADDLVs x
13738 // t2: i64 = build_pair t1, t1:1
13739 // t3: i64 = add t2, y
13740 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13741 // the add to be simplified separately.
13742 // We also need to check for sext / zext and commutative adds.
//
// MakeVecReduce(Opcode, OpcodeA, NA, NB): NB is the candidate build_pair of a
// reduction, NA the value to accumulate. Opcode is the plain reduction and
// OpcodeA its accumulating counterpart.
13743 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13744 SDValue NB) {
13745 if (NB->getOpcode() != ISD::BUILD_PAIR)
13746 return SDValue();
// Both halves of the pair must be results 0 and 1 of the same reduction.
13747 SDValue VecRed = NB->getOperand(0);
13748 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13749 VecRed.getResNo() != 0 ||
13750 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13751 return SDValue();
13752
13753 if (VecRed->getOpcode() == OpcodeA) {
13754 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13755 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13756 VecRed.getOperand(0), VecRed.getOperand(1));
13757 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13758 }
13759
// The i64 accumulator is split into two i32 halves as the first two operands.
13761 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13762
// Skip the existing accumulator operands (0 and 1) of an accumulating node.
13763 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13764 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13765 Ops.push_back(VecRed->getOperand(I));
13766 SDValue Red =
13767 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13768 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13769 SDValue(Red.getNode(), 1));
13770 };
13771
// Try every (plain, accumulating) opcode pair with both operand orders.
13772 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13773 return M;
13774 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13775 return M;
13776 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13777 return M;
13778 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13779 return M;
13780 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13781 return M;
13782 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13783 return M;
13784 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13785 return M;
13786 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13787 return M;
13788 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13789 return M;
13790 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13791 return M;
13792 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13793 return M;
13794 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13795 return M;
13796 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13797 return M;
13798 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13799 return M;
13800 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13801 return M;
13802 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13803 return M;
13804 return SDValue();
13805}
13806
// Hook deciding whether the generic combiner may commute the shift N with the
// binary operation feeding it. Always allowed before type legalization and
// for non-SHL shifts. For SHL on Thumb1 it is refused when the inner
// ADD/AND/OR/XOR uses a small constant (unsigned < 256, or for ADD a negative
// value > -256); for SHL on other subtargets it is refused outright.
// NOTE(review): listing line 13808 (the qualified function name and first
// parameter) is missing here; presumably
// ARMTargetLowering::isDesirableToCommuteWithShift -- confirm upstream.
13807bool
13809 CombineLevel Level) const {
13810 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13811 N->getOpcode() == ISD::SRL) &&
13812 "Expected shift op");
13813
13814 if (Level == BeforeLegalizeTypes)
13815 return true;
13816
13817 if (N->getOpcode() != ISD::SHL)
13818 return true;
13819
13820 if (Subtarget->isThumb1Only()) {
13821 // Avoid making expensive immediates by commuting shifts. (This logic
13822 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13823 // for free.)
// NOTE(review): this opcode check can never fire; the identical check just
// above has already returned for every non-SHL node.
13824 if (N->getOpcode() != ISD::SHL)
13825 return true;
13826 SDValue N1 = N->getOperand(0);
13827 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13828 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13829 return true;
13830 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
// Keep small constants unshifted.
13831 if (Const->getAPIntValue().ult(256))
13832 return false;
13833 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13834 Const->getAPIntValue().sgt(-256))
13835 return false;
13836 }
13837 return true;
13838 }
13839
13840 // Turn off commute-with-shift transform after legalization, so it doesn't
13841 // conflict with PerformSHLSimplify. (We could try to detect when
13842 // PerformSHLSimplify would trigger more precisely, but it isn't
13843 // really necessary.)
13844 return false;
13845}
13846
// Hook deciding whether xor(shift(x, c), mask) may be commuted. It answers
// true only when both the xor mask and the shift amount are constants and the
// mask is exactly the set of bits the shift can leave populated: bits
// [ShiftAmt, BitWidth) for SHL, bits [0, BitWidth - ShiftAmt) for SRL.
// NOTE(review): listing line 13847 (return type and qualified function name)
// is missing here; presumably
// ARMTargetLowering::isDesirableToCommuteXorWithShift -- confirm upstream.
13848 const SDNode *N) const {
13849 assert(N->getOpcode() == ISD::XOR &&
13850 (N->getOperand(0).getOpcode() == ISD::SHL ||
13851 N->getOperand(0).getOpcode() == ISD::SRL) &&
13852 "Expected XOR(SHIFT) pattern");
13853
13854 // Only commute if the entire NOT mask is a hidden shifted mask.
13855 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13856 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13857 if (XorC && ShiftC) {
13858 unsigned MaskIdx, MaskLen;
13859 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13860 unsigned ShiftAmt = ShiftC->getZExtValue();
13861 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
// SHL: the mask must start at the shift amount and run to the top bit.
13862 if (N->getOperand(0).getOpcode() == ISD::SHL)
13863 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
// SRL: the mask must start at bit 0 and cover the surviving low bits.
13864 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13865 }
13866 }
13867
13868 return false;
13869}
13870
// Hook for a shl-of-srl or srl-of-shl pair (see the assert). Answers true on
// every subtarget that is not Thumb1-only; on Thumb1-only subtargets it
// answers true only before type legalization.
// NOTE(review): listing line 13871 (return type and qualified function name)
// is missing here; presumably
// ARMTargetLowering::shouldFoldConstantShiftPairToMask -- confirm upstream.
13872 const SDNode *N, CombineLevel Level) const {
13873 assert(((N->getOpcode() == ISD::SHL &&
13874 N->getOperand(0).getOpcode() == ISD::SRL) ||
13875 (N->getOpcode() == ISD::SRL &&
13876 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13877 "Expected shift-shift mask");
13878
13879 if (!Subtarget->isThumb1Only())
13880 return true;
13881
13882 if (Level == BeforeLegalizeTypes)
13883 return true;
13884
13885 return false;
13886}
13887
// Answers true only when the subtarget has MVE integer ops and VT is a legal
// type.
// NOTE(review): listing line 13888 (return type, qualified function name and
// first parameter) is missing here; presumably
// ARMTargetLowering::shouldFoldSelectWithIdentityConstant -- confirm upstream.
13889 EVT VT) const {
13890 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT);
13891}
13892
// Type-based preference hook. Without NEON: true for every type, except that
// Thumb1-only subtargets require a scalar size of at most 32 bits. With NEON:
// true only for scalar integer types.
// NOTE(review): the whole signature (listing line 13893) is missing here; the
// body reads `VT` and `Subtarget`. Presumably
// ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const -- confirm.
13894 if (!Subtarget->hasNEON()) {
13895 if (Subtarget->isThumb1Only())
13896 return VT.getScalarSizeInBits() <= 32;
13897 return true;
13898 }
13899 return VT.isScalarInteger();
13900}
13901
// Answers whether operation Op on result type VT should be used for the
// floating-point source type FPVT. Op must be legal or custom for VT and FPVT
// must be a simple type; the answer then depends on the FP feature matching
// FPVT: f16/f32 need VFP2 base, f64 needs FP64, v4f32/v8f16 need MVE float
// ops. Every other source type yields false.
// NOTE(review): listing line 13902 (return type, qualified function name and
// the leading parameters `Op` and `FPVT`) is missing here; presumably
// ARMTargetLowering::shouldConvertFpToSat -- confirm upstream.
13903 EVT VT) const {
13904 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13905 return false;
13906
13907 switch (FPVT.getSimpleVT().SimpleTy) {
13908 case MVT::f16:
13909 return Subtarget->hasVFP2Base();
13910 case MVT::f32:
13911 return Subtarget->hasVFP2Base();
13912 case MVT::f64:
13913 return Subtarget->hasFP64();
13914 case MVT::v4f32:
13915 case MVT::v8f16:
13916 return Subtarget->hasMVEFloatOps();
13917 default:
13918 return false;
13919 }
13920}
13921
// Undoes the generic fold (op (shl x, c2), c1 << c2) back into
// (shl (op x, c1), c2) for op in {ADD, OR, XOR, AND}, when every user of N
// could absorb the shift itself and both c1 and c2 fit an 8-bit rotated
// immediate. Runs only after legalization and never on Thumb1. Returns the new
// SHL node, or an empty SDValue if the pattern does not apply.
// NOTE(review): listing lines 13922-13923 (function name and leading
// parameters `N` and `DCI`) are missing here.
13924 const ARMSubtarget *ST) {
13925 // Allow the generic combiner to identify potential bswaps.
13926 if (DCI.isBeforeLegalize())
13927 return SDValue();
13928
13929 // DAG combiner will fold:
13930 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13931 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
13932 // Other code patterns that can also be modified have the following form:
13933 // b + ((a << 1) | 510)
13934 // b + ((a << 1) & 510)
13935 // b + ((a << 1) ^ 510)
13936 // b + ((a << 1) + 510)
13937
13938 // Many instructions can perform the shift for free, but it requires both
13939 // the operands to be registers. If c1 << c2 is too large, a mov immediate
13940 // instruction will be needed. So, unfold back to the original pattern if:
13941 // - if c1 and c2 are small enough that they don't require mov imms.
13942 // - the user(s) of the node can perform an shl
13943
13944 // No shifted operands for 16-bit instructions.
// NOTE(review): presumably isThumb1Only() already implies isThumb(), which
// would make the first test redundant -- confirm against ARMSubtarget.
13945 if (ST->isThumb() && ST->isThumb1Only())
13946 return SDValue();
13947
13948 // Check that all the users could perform the shl themselves.
13949 for (auto *U : N->uses()) {
13950 switch(U->getOpcode()) {
13951 default:
13952 return SDValue();
13953 case ISD::SUB:
13954 case ISD::ADD:
13955 case ISD::AND:
13956 case ISD::OR:
13957 case ISD::XOR:
13958 case ISD::SETCC:
13959 case ARMISD::CMP:
13960 // Check that the user isn't already using a constant because there
13961 // aren't any instructions that support an immediate operand and a
13962 // shifted operand.
13963 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13964 isa<ConstantSDNode>(U->getOperand(1)))
13965 return SDValue();
13966
13967 // Check that it's not already using a shift.
13968 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13969 U->getOperand(1).getOpcode() == ISD::SHL)
13970 return SDValue();
13971 break;
13972 }
13973 }
13974
13975 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13976 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13977 return SDValue();
13978
// Only operand 0 is inspected for the shift; operand 1 must be the constant.
13979 if (N->getOperand(0).getOpcode() != ISD::SHL)
13980 return SDValue();
13981
13982 SDValue SHL = N->getOperand(0);
13983
13984 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13985 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
13986 if (!C1ShlC2 || !C2)
13987 return SDValue();
13988
13989 APInt C2Int = C2->getAPIntValue();
13990 APInt C1Int = C1ShlC2->getAPIntValue();
13991 unsigned C2Width = C2Int.getBitWidth();
// Reject shift amounts that are not smaller than the bit width.
13992 if (C2Int.uge(C2Width))
13993 return SDValue();
13994 uint64_t C2Value = C2Int.getZExtValue();
13995
13996 // Check that performing a lshr will not lose any information.
// Mask covers the top (width - c2) bits, so the low c2 bits of the constant
// must all be zero.
13997 APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
13998 if ((C1Int & Mask) != C1Int)
13999 return SDValue();
14000
14001 // Shift the first constant.
14002 C1Int.lshrInPlace(C2Int);
14003
14004 // The immediates are encoded as an 8-bit value that can be rotated.
// An immediate is "large" when its significant bits (width minus leading and
// trailing zeros) span more than 8 bits.
14005 auto LargeImm = [](const APInt &Imm) {
14006 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
14007 return Imm.getBitWidth() - Zeros > 8;
14008 };
14009
14010 if (LargeImm(C1Int) || LargeImm(C2Int))
14011 return SDValue();
14012
14013 SelectionDAG &DAG = DCI.DAG;
14014 SDLoc dl(N);
14015 SDValue X = SHL.getOperand(0);
14016 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
14017 DAG.getConstant(C1Int, dl, MVT::i32));
14018 // Shift left to compensate for the lshr of C1Int.
14019 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
14020
14021 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
14022 SHL.dump(); N->dump());
14023 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
14024 return Res;
14025}
14026
14027
14028/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
14029///
/// Tries, in order: PerformSHLSimplify, PerformADDVecReduce, then
/// PerformADDCombineWithOperands with the operands as given and finally with
/// them commuted. The first non-empty result is returned.
// NOTE(review): listing lines 14030-14031 (function name and leading
// parameters `N` and `DCI`) are missing here.
14032 const ARMSubtarget *Subtarget) {
14033 SDValue N0 = N->getOperand(0);
14034 SDValue N1 = N->getOperand(1);
14035
14036 // Only works one way, because it needs an immediate operand.
14037 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14038 return Result;
14039
14040 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
14041 return Result;
14042
14043 // First try with the default operand order.
14044 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
14045 return Result;
14046
14047 // If that didn't work, try again with the operands commuted.
14048 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
14049}
14050
14051// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
14052// providing -X is as cheap as X (currently, just a constant).
//
// Applies only to i32 subs whose first operand is the constant zero and whose
// second operand is a single-use ARMISD::CSINC with a constant first operand.
// Returns the new CSINV node, or an empty SDValue otherwise.
// NOTE(review): the signature (listing line 14053) is missing here; the body
// uses `N` and `DAG`.
14054 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
14055 return SDValue();
14056 SDValue CSINC = N->getOperand(1);
14057 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
14058 return SDValue();
14059
14060 ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
14061 if (!X)
14062 return SDValue();
14063
// The negation is built as (sub 0, X); the remaining three CSINC operands are
// passed through unchanged.
14064 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
14065 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
14066 CSINC.getOperand(0)),
14067 CSINC.getOperand(1), CSINC.getOperand(2),
14068 CSINC.getOperand(3));
14069}
14070
14071/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
14072///
/// Tries the select fold, then PerformSubCSINCCombine, and finally (MVE
/// integer ops and vector result types only) rewrites a subtraction of a VDUP
/// from a zero vector as a VDUP of the negated scalar.
// NOTE(review): listing lines 14073-14074 (function name and leading
// parameters `N` and `DCI`) are missing here.
14075 const ARMSubtarget *Subtarget) {
14076 SDValue N0 = N->getOperand(0);
14077 SDValue N1 = N->getOperand(1);
14078
14079 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
14080 if (N1.getNode()->hasOneUse())
14081 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
14082 return Result;
14083
14084 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
14085 return R;
14086
14087 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
14088 return SDValue();
14089
14090 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
14091 // so that we can readily pattern match more mve instructions which can use
14092 // a scalar operand.
14093 SDValue VDup = N->getOperand(1);
14094 if (VDup->getOpcode() != ARMISD::VDUP)
14095 return SDValue();
14096
// The zero vector may be wrapped in a bitcast; look through it.
14097 SDValue VMov = N->getOperand(0);
14098 if (VMov->getOpcode() == ISD::BITCAST)
14099 VMov = VMov->getOperand(0);
14100
14101 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
14102 return SDValue();
14103
// Negate the scalar in i32, then splat the result at the original type.
14104 SDLoc dl(N);
14105 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
14106 DCI.DAG.getConstant(0, dl, MVT::i32),
14107 VDup->getOperand(0));
14108 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
14109}
14110
14111/// PerformVMULCombine
14112/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
14113/// special multiplier accumulator forwarding.
14114/// vmul d3, d0, d2
14115/// vmla d3, d1, d2
14116/// is faster than
14117/// vadd d3, d0, d1
14118/// vmul d3, d3, d2
14119// However, for (A + B) * (A + B),
14120// vadd d2, d0, d1
14121// vmul d3, d0, d2
14122// vmla d3, d1, d2
14123// is slower than
14124// vadd d2, d0, d1
14125// vmul d3, d2, d2
//
// Only done when the subtarget reports VMLx forwarding. The same distribution
// is applied when the inner node is SUB, FADD or FSUB.
// NOTE(review): listing lines 14126-14127 (function name and leading
// parameters `N` and `DCI`) are missing here.
14128 const ARMSubtarget *Subtarget) {
14129 if (!Subtarget->hasVMLxForwarding())
14130 return SDValue();
14131
14132 SelectionDAG &DAG = DCI.DAG;
14133 SDValue N0 = N->getOperand(0);
14134 SDValue N1 = N->getOperand(1);
// Make N0 the add/sub operand, swapping if it is found in operand 1 instead.
14135 unsigned Opcode = N0.getOpcode();
14136 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14137 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
14138 Opcode = N1.getOpcode();
14139 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14140 Opcode != ISD::FADD && Opcode != ISD::FSUB)
14141 return SDValue();
14142 std::swap(N0, N1);
14143 }
14144
// Squaring, (A + B) * (A + B), is left alone; see the comment above.
14145 if (N0 == N1)
14146 return SDValue();
14147
14148 EVT VT = N->getValueType(0);
14149 SDLoc DL(N);
14150 SDValue N00 = N0->getOperand(0);
14151 SDValue N01 = N0->getOperand(1);
14152 return DAG.getNode(Opcode, DL, VT,
14153 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
14154 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
14155}
14156
// Turns a v2i64 multiply whose operands are both extended from 32 bits into a
// widening multiply: ARMISD::VMULLs when both are sign-extended,
// ARMISD::VMULLu when both are zero-extended. The un-extended inputs are
// reinterpreted as v4i32 with VECTOR_REG_CAST. Returns an empty SDValue
// otherwise.
// NOTE(review): listing line 14157 (function name and leading parameters `N`
// and `DAG`) is missing here.
14158 const ARMSubtarget *Subtarget) {
14159 EVT VT = N->getValueType(0);
14160 if (VT != MVT::v2i64)
14161 return SDValue();
14162
14163 SDValue N0 = N->getOperand(0);
14164 SDValue N1 = N->getOperand(1);
14165
// Returns the input of a SIGN_EXTEND_INREG from 32-bit elements, or an empty
// SDValue if Op is not one.
14166 auto IsSignExt = [&](SDValue Op) {
14167 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
14168 return SDValue();
14169 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
14170 if (VT.getScalarSizeInBits() == 32)
14171 return Op->getOperand(0);
14172 return SDValue();
14173 };
// Returns the value being masked when Op is a zero-extension from 32 bits,
// or an empty SDValue if Op does not match.
14174 auto IsZeroExt = [&](SDValue Op) {
14175 // Zero extends are a little more awkward. At the point we are matching
14176 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
14177 // That might be before or after a bitcast depending on how the and is
14178 // placed. Because this has to look through bitcasts, it is currently only
14179 // supported on LE.
14180 if (!Subtarget->isLittle())
14181 return SDValue();
14182
14183 SDValue And = Op;
14184 if (And->getOpcode() == ISD::BITCAST)
14185 And = And->getOperand(0);
14186 if (And->getOpcode() != ISD::AND)
14187 return SDValue();
14188 SDValue Mask = And->getOperand(1);
14189 if (Mask->getOpcode() == ISD::BITCAST)
14190 Mask = Mask->getOperand(0);
14191
14192 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
14193 Mask.getValueType() != MVT::v4i32)
14194 return SDValue();
14195 if (isAllOnesConstant(Mask->getOperand(0)) &&
14196 isNullConstant(Mask->getOperand(1)) &&
14197 isAllOnesConstant(Mask->getOperand(2)) &&
14198 isNullConstant(Mask->getOperand(3)))
14199 return And->getOperand(0);
14200 return SDValue();
14201 };
14202
// Both operands must use the same kind of extension.
14203 SDLoc dl(N);
14204 if (SDValue Op0 = IsSignExt(N0)) {
14205 if (SDValue Op1 = IsSignExt(N1)) {
14206 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14207 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14208 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
14209 }
14210 }
14211 if (SDValue Op0 = IsZeroExt(N0)) {
14212 if (SDValue Op1 = IsZeroExt(N1)) {
14213 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14214 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14215 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14216 }
14217 }
14218
14219 return SDValue();
14220}
14221
// Combine for multiply nodes. v2i64 multiplies on MVE go to
// PerformMVEVMULLCombine, and 64/128-bit vectors go to PerformVMULCombine
// (after legalization, not on Thumb1). An i32 multiply by a constant of the
// form +/-(2^N +/- 1) << S is strength-reduced to shift/add/sub. In the i32
// case the replacement is installed through DCI.CombineTo and an empty SDValue
// is returned.
// NOTE(review): listing lines 14222-14223 (function name and leading
// parameters `N` and `DCI`) are missing here; presumably PerformMULCombine --
// confirm upstream.
14224 const ARMSubtarget *Subtarget) {
14225 SelectionDAG &DAG = DCI.DAG;
14226
14227 EVT VT = N->getValueType(0);
14228 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14229 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14230
14231 if (Subtarget->isThumb1Only())
14232 return SDValue();
14233
14234 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14235 return SDValue();
14236
14237 if (VT.is64BitVector() || VT.is128BitVector())
14238 return PerformVMULCombine(N, DCI, Subtarget);
14239 if (VT != MVT::i32)
14240 return SDValue();
14241
14242 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14243 if (!C)
14244 return SDValue();
14245
// Factor the multiplier into (MulAmt >> ShiftAmt) * 2^ShiftAmt. ShiftAmt is
// the trailing-zero count reduced modulo 32.
14246 int64_t MulAmt = C->getSExtValue();
14247 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);
14248
14249 ShiftAmt = ShiftAmt & (32 - 1);
14250 SDValue V = N->getOperand(0);
14251 SDLoc DL(N);
14252
14253 SDValue Res;
14254 MulAmt >>= ShiftAmt;
14255
14256 if (MulAmt >= 0) {
14257 if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
14258 // (mul x, 2^N + 1) => (add (shl x, N), x)
14259 Res = DAG.getNode(ISD::ADD, DL, VT,
14260 V,
14261 DAG.getNode(ISD::SHL, DL, VT,
14262 V,
14263 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14264 MVT::i32)));
14265 } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
14266 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14267 Res = DAG.getNode(ISD::SUB, DL, VT,
14268 DAG.getNode(ISD::SHL, DL, VT,
14269 V,
14270 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14271 MVT::i32)),
14272 V);
14273 } else
14274 return SDValue();
14275 } else {
// Negative multipliers are matched on their absolute value.
14276 uint64_t MulAmtAbs = -MulAmt;
14277 if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
14278 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14279 Res = DAG.getNode(ISD::SUB, DL, VT,
14280 V,
14281 DAG.getNode(ISD::SHL, DL, VT,
14282 V,
14283 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14284 MVT::i32)));
14285 } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
14286 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14287 Res = DAG.getNode(ISD::ADD, DL, VT,
14288 V,
14289 DAG.getNode(ISD::SHL, DL, VT,
14290 V,
14291 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14292 MVT::i32)));
14293 Res = DAG.getNode(ISD::SUB, DL, VT,
14294 DAG.getConstant(0, DL, MVT::i32), Res);
14295 } else
14296 return SDValue();
14297 }
14298
// Re-apply the power-of-two factor that was stripped off above.
14299 if (ShiftAmt != 0)
14300 Res = DAG.getNode(ISD::SHL, DL, VT,
14301 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14302
14303 // Do not add new nodes to DAG combiner worklist.
14304 DCI.CombineTo(N, Res, false);
14305 return SDValue();
14306}
14307
// Rewrites an i32 (and (shl|srl x, c2), c1) into a pair of shifts, or into
// (shl (and x, c1 >> c2), c2), so that the mask c1 need not be materialised.
// Runs only after legalization, requires the shift to have a single use and
// 0 < c2 < 32, and leaves masks 255 and 65535 alone. Returns an empty SDValue
// when no form applies.
// NOTE(review): this listing has dropped listing lines 14308-14309 (function
// name and leading parameters `N` and `DCI`) and 14336 (the declaration of
// `N01C`, presumably the shift-amount constant taken from N0's operand 1).
// Presumably this is CombineANDShift -- confirm upstream.
14310 const ARMSubtarget *Subtarget) {
14311 // Allow DAGCombine to pattern-match before we touch the canonical form.
14312 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14313 return SDValue();
14314
14315 if (N->getValueType(0) != MVT::i32)
14316 return SDValue();
14317
14318 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14319 if (!N1C)
14320 return SDValue();
14321
14322 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14323 // Don't transform uxtb/uxth.
14324 if (C1 == 255 || C1 == 65535)
14325 return SDValue();
14326
14327 SDNode *N0 = N->getOperand(0).getNode();
14328 if (!N0->hasOneUse())
14329 return SDValue();
14330
14331 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14332 return SDValue();
14333
14334 bool LeftShift = N0->getOpcode() == ISD::SHL;
14335
14337 if (!N01C)
14338 return SDValue();
14339
14340 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14341 if (!C2 || C2 >= 32)
14342 return SDValue();
14343
14344 // Clear irrelevant bits in the mask.
// Bits the shift already forces to zero cannot affect the AND result.
14345 if (LeftShift)
14346 C1 &= (-1U << C2);
14347 else
14348 C1 &= (-1U >> C2);
14349
14350 SelectionDAG &DAG = DCI.DAG;
14351 SDLoc DL(N);
14352
14353 // We have a pattern of the form "(and (shl x, c2) c1)" or
14354 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14355 // transform to a pair of shifts, to save materializing c1.
// NOTE(review): the local named `SHL` below holds whichever shift is emitted
// first; in the "reversed" patterns that is actually an SRL node.
14356
14357 // First pattern: right shift, then mask off leading bits.
14358 // FIXME: Use demanded bits?
14359 if (!LeftShift && isMask_32(C1)) {
14360 uint32_t C3 = llvm::countl_zero(C1);
14361 if (C2 < C3) {
14362 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14363 DAG.getConstant(C3 - C2, DL, MVT::i32));
14364 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14365 DAG.getConstant(C3, DL, MVT::i32));
14366 }
14367 }
14368
14369 // First pattern, reversed: left shift, then mask off trailing bits.
14370 if (LeftShift && isMask_32(~C1)) {
14371 uint32_t C3 = llvm::countr_zero(C1);
14372 if (C2 < C3) {
14373 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14374 DAG.getConstant(C3 - C2, DL, MVT::i32));
14375 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14376 DAG.getConstant(C3, DL, MVT::i32));
14377 }
14378 }
14379
14380 // Second pattern: left shift, then mask off leading bits.
14381 // FIXME: Use demanded bits?
14382 if (LeftShift && isShiftedMask_32(C1)) {
14383 uint32_t Trailing = llvm::countr_zero(C1);
14384 uint32_t C3 = llvm::countl_zero(C1);
14385 if (Trailing == C2 && C2 + C3 < 32) {
14386 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14387 DAG.getConstant(C2 + C3, DL, MVT::i32));
14388 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14389 DAG.getConstant(C3, DL, MVT::i32));
14390 }
14391 }
14392
14393 // Second pattern, reversed: right shift, then mask off trailing bits.
14394 // FIXME: Handle other patterns of known/demanded bits.
14395 if (!LeftShift && isShiftedMask_32(C1)) {
14396 uint32_t Leading = llvm::countl_zero(C1);
14397 uint32_t C3 = llvm::countr_zero(C1);
14398 if (Leading == C2 && C2 + C3 < 32) {
14399 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14400 DAG.getConstant(C2 + C3, DL, MVT::i32));
14401 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14402 DAG.getConstant(C3, DL, MVT::i32));
14403 }
14404 }
14405
14406 // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
14407 // if "c1 >> c2" is a cheaper immediate than "c1"
14408 if (LeftShift &&
14409 HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {
14410
14411 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
14412 DAG.getConstant(C1 >> C2, DL, MVT::i32));
14413 return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
14414 DAG.getConstant(C2, DL, MVT::i32));
14415 }
14416
14417 return SDValue();
14418}
14419
// Target-specific DAG combine for a node N that, judging by the comments and
// helpers used below, is an AND. (The opening of the signature, including the
// function name, is not visible in this listing.)
//
// Tried in order:
//  1. Immediate-form VBIC, when operand 1 is a constant-splat build vector and
//     the subtarget has NEON or MVE integer ops.
//  2. On non-Thumb1 subtargets: combineSelectAndUseCommutative, then
//     PerformSHLSimplify.
//  3. On Thumb1-only subtargets: CombineANDShift.
// Returns the replacement value, or an empty SDValue when nothing applies.
14422 const ARMSubtarget *Subtarget) {
14423 // Attempt to use immediate-form VBIC
14424 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14425 SDLoc dl(N);
14426 EVT VT = N->getValueType(0);
14427 SelectionDAG &DAG = DCI.DAG;
14428
      // Nothing is attempted for illegal types or for the vNi1 vector types.
14429 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14430 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14431 return SDValue();
14432
14433 APInt SplatBits, SplatUndef;
14434 unsigned SplatBitSize;
14435 bool HasAnyUndefs;
14436 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14437 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14438 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14439 SplatBitSize == 64) {
14440 EVT VbicVT;
      // The splat is complemented before asking whether it can be encoded
      // as a modified immediate; VbicVT receives the type to operate in.
14441 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14442 SplatUndef.getZExtValue(), SplatBitSize,
14443 DAG, dl, VbicVT, VT, OtherModImm);
14444 if (Val.getNode()) {
      // Cast the input to VbicVT, apply VBICIMM there, then cast the
      // result back to the original type.
14445 SDValue Input =
14446 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
14447 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14448 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
14449 }
14450 }
14451 }
14452
14453 if (!Subtarget->isThumb1Only()) {
14454 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
14455 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14456 return Result;
14457
14458 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14459 return Result;
14460 }
14461
      // Thumb1-only: try rewriting (and (shift x, c2), c1) via CombineANDShift.
14462 if (Subtarget->isThumb1Only())
14463 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14464 return Result;
14465
14466 return SDValue();
14467}
14468
14469// Try combining OR nodes to SMULWB, SMULWT.
// Matches (or (srl (smul_lohi a, b):0, 16), (shl (smul_lohi a, b):1, 16)) with
// the two shifts in either operand order. On a match every use of the OR is
// replaced with the new SMULWB/SMULWT node and SDValue(OR, 0) is returned;
// otherwise an empty SDValue is returned.
// NOTE(review): the opening of the signature is not visible in this listing.
14472 const ARMSubtarget *Subtarget) {
      // Requires V6 ops; in Thumb mode it additionally requires Thumb2 and DSP.
14473 if (!Subtarget->hasV6Ops() ||
14474 (Subtarget->isThumb() &&
14475 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14476 return SDValue();
14477
14478 SDValue SRL = OR->getOperand(0);
14479 SDValue SHL = OR->getOperand(1);
14480
      // Accept the operands in the opposite order as well.
14481 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14482 SRL = OR->getOperand(1);
14483 SHL = OR->getOperand(0);
14484 }
14485 if (!isSRL16(SRL) || !isSHL16(SHL))
14486 return SDValue();
14487
14488 // The first operands to the shifts need to be the two results from the
14489 // same smul_lohi node.
14490 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
14491 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
14492 return SDValue();
14493
      // The SRL must consume result 0 and the SHL result 1 of the SMUL_LOHI.
14494 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
14495 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
14496 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
14497 return SDValue();
14498
14499 // Now we have:
14500 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
14501 // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
14502 // For SMULWB the 16-bit value will be sign extended somehow.
14503 // For SMULWT only the SRA is required.
14504 // Check both sides of SMUL_LOHI
14505 SDValue OpS16 = SMULLOHI->getOperand(0);
14506 SDValue OpS32 = SMULLOHI->getOperand(1);
14507
14508 SelectionDAG &DAG = DCI.DAG;
      // If operand 0 is not the 16-bit one, try the operands the other way round.
14509 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
14510 OpS16 = OpS32;
14511 OpS32 = SMULLOHI->getOperand(0);
14512 }
14513
14514 SDLoc dl(OR);
14515 unsigned Opcode = 0;
14516 if (isS16(OpS16, DAG))
14517 Opcode = ARMISD::SMULWB;
14518 else if (isSRA16(OpS16)) {
14519 Opcode = ARMISD::SMULWT;
      // SMULWT takes the value that feeds the SRA, not the SRA itself.
14520 OpS16 = OpS16->getOperand(0);
14521 }
14522 else
14523 return SDValue();
14524
14525 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14526 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
14527 return SDValue(OR, 0);
14528}
14529
// Tries to turn `or (and A, mask), X` into an ARMISD::BFI node, where N is the
// OR, N0 its operand 0 and N1 its operand 1. Only i32 is handled, and only on
// non-Thumb1 subtargets with V6T2 ops. On success the OR is replaced through
// DCI.CombineTo and SDValue(N, 0) is returned; otherwise an empty SDValue.
//
// N0's operands are read without checking its opcode; the caller visible in
// this chunk only calls this function when N0 is a single-use ISD::AND.
//
// NOTE(review): the opening of the signature, the declarations of MaskC, N1C
// and N11C, and the last line of the case (3) condition are not visible in
// this listing. MaskC/N1C/N11C are presumably dyn_cast<ConstantSDNode> of
// MaskOp, N1 and N1's operand 1 respectively — confirm against the full file.
14532 const ARMSubtarget *Subtarget) {
14533 // BFI is only available on V6T2+
14534 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14535 return SDValue();
14536
14537 EVT VT = N->getValueType(0);
14538 SDValue N0 = N->getOperand(0);
14539 SDValue N1 = N->getOperand(1);
14540 SelectionDAG &DAG = DCI.DAG;
14541 SDLoc DL(N);
14542 // 1) or (and A, mask), val => ARMbfi A, val, mask
14543 // iff (val & mask) == val
14544 //
14545 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14546 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14547 // && mask == ~mask2
14548 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14549 // && ~mask == mask2
14550 // (i.e., copy a bitfield value into another bitfield of the same width)
14551
14552 if (VT != MVT::i32)
14553 return SDValue();
14554
14555 SDValue N00 = N0.getOperand(0);
14556
14557 // The value and the mask need to be constants so we can verify this is
14558 // actually a bitfield set. If the mask is 0xffff, we can do better
14559 // via a movt instruction, so don't use BFI in that case.
14560 SDValue MaskOp = N0.getOperand(1);
14562 if (!MaskC)
14563 return SDValue();
14564 unsigned Mask = MaskC->getZExtValue();
14565 if (Mask == 0xffff)
14566 return SDValue();
14567 SDValue Res;
14568 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14570 if (N1C) {
14571 unsigned Val = N1C->getZExtValue();
      // The constant must only have bits set where the mask is clear.
14572 if ((Val & ~Mask) != Val)
14573 return SDValue();
14574
14575 if (ARM::isBitFieldInvertedMask(Mask)) {
      // Shift the value down by the position of the lowest cleared mask bit.
14576 Val >>= llvm::countr_zero(~Mask);
14577
14578 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14579 DAG.getConstant(Val, DL, MVT::i32),
14580 DAG.getConstant(Mask, DL, MVT::i32));
14581
14582 DCI.CombineTo(N, Res, false);
14583 // Return value from the original node to inform the combiner that N is
14584 // now dead.
14585 return SDValue(N, 0);
14586 }
14587 } else if (N1.getOpcode() == ISD::AND) {
14588 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14590 if (!N11C)
14591 return SDValue();
14592 unsigned Mask2 = N11C->getZExtValue();
14593
14594 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14595 // as is to match.
14596 if (ARM::isBitFieldInvertedMask(Mask) &&
14597 (Mask == ~Mask2)) {
14598 // The pack halfword instruction works better for masks that fit it,
14599 // so use that when it's available.
14600 if (Subtarget->hasDSP() &&
14601 (Mask == 0xffff || Mask == 0xffff0000))
14602 return SDValue();
14603 // 2a
      // Shift B right by the position of its field, then insert it into A.
14604 unsigned amt = llvm::countr_zero(Mask2);
14605 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14606 DAG.getConstant(amt, DL, MVT::i32));
14607 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14608 DAG.getConstant(Mask, DL, MVT::i32));
14609 DCI.CombineTo(N, Res, false);
14610 // Return value from the original node to inform the combiner that N is
14611 // now dead.
14612 return SDValue(N, 0);
14613 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14614 (~Mask == Mask2)) {
14615 // The pack halfword instruction works better for masks that fit it,
14616 // so use that when it's available.
14617 if (Subtarget->hasDSP() &&
14618 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14619 return SDValue();
14620 // 2b
      // Mirror of 2a: A supplies the field and B is the insertion target.
14621 unsigned lsb = llvm::countr_zero(Mask);
14622 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14623 DAG.getConstant(lsb, DL, MVT::i32));
14624 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14625 DAG.getConstant(Mask2, DL, MVT::i32));
14626 DCI.CombineTo(N, Res, false);
14627 // Return value from the original node to inform the combiner that N is
14628 // now dead.
14629 return SDValue(N, 0);
14630 }
14631 }
14632
14633 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14634 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14636 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14637 // where lsb(mask) == #shamt and masked bits of B are known zero.
14638 SDValue ShAmt = N00.getOperand(1);
14639 unsigned ShAmtC = ShAmt->getAsZExtVal();
14640 unsigned LSB = llvm::countr_zero(Mask);
14641 if (ShAmtC != LSB)
14642 return SDValue();
14643
14644 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14645 DAG.getConstant(~Mask, DL, MVT::i32));
14646
14647 DCI.CombineTo(N, Res, false);
14648 // Return value from the original node to inform the combiner that N is
14649 // now dead.
14650 return SDValue(N, 0);
14651 }
14652
14653 return SDValue();
14654}
14655
14656static bool isValidMVECond(unsigned CC, bool IsFloat) {
14657 switch (CC) {
14658 case ARMCC::EQ:
14659 case ARMCC::NE:
14660 case ARMCC::LE:
14661 case ARMCC::GT:
14662 case ARMCC::GE:
14663 case ARMCC::LT:
14664 return true;
14665 case ARMCC::HS:
14666 case ARMCC::HI:
14667 return !IsFloat;
14668 default:
14669 return false;
14670 };
14671}
14672
// Returns the ARMCC condition code carried by a compare node N: constant
// operand 2 for ARMISD::VCMP, constant operand 1 for ARMISD::VCMPZ. Any other
// opcode hits llvm_unreachable.
// NOTE(review): the signature line (and therefore the function's name) is not
// visible in this listing.
14674 if (N->getOpcode() == ARMISD::VCMP)
14675 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14676 else if (N->getOpcode() == ARMISD::VCMPZ)
14677 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14678 else
14679 llvm_unreachable("Not a VCMP/VCMPZ!");
14680}
14681
// Tail of CanInvertMVEVCMP (name taken from its call sites in this chunk).
// Reports whether condition code CC passes isValidMVECond, treating the
// compare as floating-point when N's operand 0 has a floating-point type.
// NOTE(review): the signature and the computation of CC are not visible in
// this listing; CC is presumably the opposite of N's own condition — confirm.
14684 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14685}
14686
// Rewrites an OR (N) through De Morgan's law when at least one operand is a
// VCMP/VCMPZ for which CanInvertMVEVCMP holds:
//   or A, B  ->  not (and (not A), (not B))
// Returns the rewritten value, or an empty SDValue when neither operand
// qualifies. The caller visible in this chunk only uses it for the vNi1 vector
// types on subtargets with MVE integer ops.
// NOTE(review): the opening of the signature is not visible in this listing.
14688 const ARMSubtarget *Subtarget) {
14689 // Try to invert "or A, B" -> "not (and ~A, ~B)", as the "and" is easier to
14690 // chain together with predicates
14691 EVT VT = N->getValueType(0);
14692 SDLoc DL(N);
14693 SDValue N0 = N->getOperand(0);
14694 SDValue N1 = N->getOperand(1);
14695
      // Only VCMP/VCMPZ nodes accepted by CanInvertMVEVCMP count as free.
14696 auto IsFreelyInvertable = [&](SDValue V) {
14697 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14698 return CanInvertMVEVCMP(V);
14699 return false;
14700 };
14701
14702 // At least one operand must be freely invertable.
14703 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14704 return SDValue();
14705
14706 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14707 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14708 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14709 return DAG.getLogicalNOT(DL, And, VT);
14710}
14711
14712/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
///
/// Tried in order, for legal result types only:
///  1. vNi1 vectors with MVE integer ops: PerformORCombine_i1 (its result is
///     returned unconditionally).
///  2. Immediate-form VORR when operand 1 is a constant-splat build vector.
///  3. Non-Thumb1: combineSelectAndUseCommutative, PerformORCombineToSMULWBT.
///  4. NEON vector select of the form (or (and B, A), (and C, ~A)).
///  5. BFI formation when operand 0 is a single-use AND.
///  6. PerformSHLSimplify.
/// Returns the replacement value, or an empty SDValue when nothing applies.
// NOTE(review): the opening of the signature, the tail of the NEON condition
// and the declarations of BVN0/BVN1 are not visible in this listing.
14715 const ARMSubtarget *Subtarget) {
14716 // Attempt to use immediate-form VORR
14717 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14718 SDLoc dl(N);
14719 EVT VT = N->getValueType(0);
14720 SelectionDAG &DAG = DCI.DAG;
14721
14722 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14723 return SDValue();
14724
14725 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14726 VT == MVT::v8i1 || VT == MVT::v16i1))
14727 return PerformORCombine_i1(N, DAG, Subtarget);
14728
14729 APInt SplatBits, SplatUndef;
14730 unsigned SplatBitSize;
14731 bool HasAnyUndefs;
14732 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14733 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14734 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14735 SplatBitSize == 64) {
14736 EVT VorrVT;
14737 SDValue Val =
14738 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14739 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14740 if (Val.getNode()) {
      // Cast the input to VorrVT, apply VORRIMM there, then cast back to VT.
14741 SDValue Input =
14742 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
14743 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14744 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
14745 }
14746 }
14747 }
14748
14749 if (!Subtarget->isThumb1Only()) {
14750 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14751 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14752 return Result;
14753 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14754 return Result;
14755 }
14756
14757 SDValue N0 = N->getOperand(0);
14758 SDValue N1 = N->getOperand(1);
14759
14760 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14761 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14763
14764 // The code below optimizes (or (and X, Y), Z).
14765 // The AND operand needs to have a single user to make these optimizations
14766 // profitable.
14767 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14768 return SDValue();
14769
14770 APInt SplatUndef;
14771 unsigned SplatBitSize;
14772 bool HasAnyUndefs;
14773
14774 APInt SplatBits0, SplatBits1;
14777 // Ensure that the second operand of both ands are constants
14778 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14779 HasAnyUndefs) && !HasAnyUndefs) {
14780 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14781 HasAnyUndefs) && !HasAnyUndefs) {
14782 // Ensure that the bit width of the constants are the same and that
14783 // the splat arguments are logical inverses as per the pattern we
14784 // are trying to simplify.
14785 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14786 SplatBits0 == ~SplatBits1) {
14787 // Canonicalize the vector type to make instruction selection
14788 // simpler.
14789 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
      // The node emitted is ARMISD::VBSP, with N0's constant as the
      // selector operand.
14790 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14791 N0->getOperand(1),
14792 N0->getOperand(0),
14793 N1->getOperand(0));
14794 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
14795 }
14796 }
14797 }
14798 }
14799
14800 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14801 // reasonable.
14802 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14803 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14804 return Res;
14805 }
14806
14807 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14808 return Result;
14809
14810 return SDValue();
14811}
14812
// Target-specific DAG combine for a node N that, judging by the comments
// below, is an XOR. For legal result types it tries:
//  1. On non-Thumb1 subtargets: combineSelectAndUseCommutative, then
//     PerformSHLSimplify.
//  2. With MVE integer ops: folding an XOR of a VCMP/VCMPZ with a constant
//     true value into a compare that uses condition code CC.
// Returns the replacement value, or an empty SDValue when nothing applies.
// NOTE(review): the opening of the signature (including the function name) and
// the declarations of CC and Ops are not visible in this listing; CC is
// presumably the opposite of N0's condition — confirm against the full file.
14815 const ARMSubtarget *Subtarget) {
14816 EVT VT = N->getValueType(0);
14817 SelectionDAG &DAG = DCI.DAG;
14818
14819 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14820 return SDValue();
14821
14822 if (!Subtarget->isThumb1Only()) {
14823 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14824 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14825 return Result;
14826
14827 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14828 return Result;
14829 }
14830
14831 if (Subtarget->hasMVEIntegerOps()) {
14832 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14833 SDValue N0 = N->getOperand(0);
14834 SDValue N1 = N->getOperand(1);
14835 const TargetLowering *TLI = Subtarget->getTargetLowering();
14836 if (TLI->isConstTrueVal(N1) &&
14837 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14838 if (CanInvertMVEVCMP(N0)) {
14839 SDLoc DL(N0);
14841
      // Rebuild the compare with the same inputs: VCMP carries two value
      // operands, VCMPZ only one; the condition code always goes last.
14843 Ops.push_back(N0->getOperand(0));
14844 if (N0->getOpcode() == ARMISD::VCMP)
14845 Ops.push_back(N0->getOperand(1));
14846 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14847 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14848 }
14849 }
14850 }
14851
14852 return SDValue();
14853}
14854
14855// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14856// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14857// their position in "to" (Rd).
//
// ToMask is the complement of the BFI's constant operand 2. FromMask starts as
// the low popcount(ToMask) bits; when the "from" operand is a right shift by a
// constant, FromMask is shifted left by that amount and the shift's input is
// returned instead of the shift itself.
// NOTE(review): the declaration of `Shift` is not visible in this listing;
// presumably it is the SRL's constant shift amount as an APInt — confirm.
14858static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14859 assert(N->getOpcode() == ARMISD::BFI);
14860
14861 SDValue From = N->getOperand(1);
14862 ToMask = ~N->getConstantOperandAPInt(2);
14863 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14864
14865 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14866 // #C in the base of the SHR.
14867 if (From->getOpcode() == ISD::SRL &&
14868 isa<ConstantSDNode>(From->getOperand(1))) {
14870 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14871 FromMask <<= Shift.getLimitedValue(31);
14872 From = From->getOperand(0);
14873 }
14874
14875 return From;
14876}
14877
14878// If A and B contain one contiguous set of bits, does A | B == A . B?
14879//
14880// Neither A nor B must be zero.
14881static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14882 unsigned LastActiveBitInA = A.countr_zero();
14883 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14884 return LastActiveBitInA - 1 == FirstActiveBitInB;
14885}
14886
// Looks for a BFI that the BFI node N can be merged with. Only N's operand 0
// is considered. It is returned when it is itself a BFI that inserts from the
// same source value as N, writes bits that do not overlap N's, and whose to-
// and from-masks concatenate with N's (in either order). Otherwise an empty
// SDValue is returned.
// NOTE(review): the signature line is not visible in this listing; the name is
// taken from the call site in this chunk.
14888 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14889 APInt ToMask, FromMask;
14890 SDValue From = ParseBFI(N, ToMask, FromMask);
14891 SDValue To = N->getOperand(0);
14892
14893 SDValue V = To;
14894 if (V.getOpcode() != ARMISD::BFI)
14895 return SDValue();
14896
14897 APInt NewToMask, NewFromMask;
14898 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
      // Both BFIs must insert bits taken from the same source value.
14899 if (NewFrom != From)
14900 return SDValue();
14901
14902 // Do the written bits conflict with any we've seen so far?
14903 if ((NewToMask & ToMask).getBoolValue())
14904 // Conflicting bits.
14905 return SDValue();
14906
14907 // Are the new bits contiguous when combined with the old bits?
14908 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14909 BitsProperlyConcatenate(FromMask, NewFromMask))
14910 return V;
14911 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14912 BitsProperlyConcatenate(NewFromMask, FromMask))
14913 return V;
14914
14915 return SDValue();
14916}
14917
// DAG combine for an ARMISD::BFI node N. Three folds are attempted:
//  1. (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) when the AND only
//     clears bits the BFI does not read. If operand 1 is an AND and this fold
//     does not apply, the function returns without trying the others.
//  2. Merging N with a BFI found by FindBFIToCombineWith into a single BFI.
//  3. Reordering two nested BFIs with disjoint masks.
// Returns the new value, or an empty SDValue when nothing applies.
// NOTE(review): the signature (including the function name) and the
// declaration of N11C are not visible in this listing; N11C is presumably a
// dyn_cast<ConstantSDNode> of N1's operand 1 — confirm against the full file.
14919 SDValue N0 = N->getOperand(0);
14920 SDValue N1 = N->getOperand(1);
14921
14922 if (N1.getOpcode() == ISD::AND) {
14923 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14924 // the bits being cleared by the AND are not demanded by the BFI.
14926 if (!N11C)
14927 return SDValue();
      // Operand 2 has zeros where the BFI writes; LSB and Width describe that
      // field, and Mask is the Width low bits the BFI reads from its source.
14928 unsigned InvMask = N->getConstantOperandVal(2);
14929 unsigned LSB = llvm::countr_zero(~InvMask);
14930 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
14931 assert(Width <
14932 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14933 "undefined behavior");
14934 unsigned Mask = (1u << Width) - 1;
14935 unsigned Mask2 = N11C->getZExtValue();
      // The AND is redundant if it keeps every bit the BFI reads.
14936 if ((Mask & (~Mask2)) == 0)
14937 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14938 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14939 return SDValue();
14940 }
14941
14942 // Look for another BFI to combine with.
14943 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14944 // We've found a BFI.
14945 APInt ToMask1, FromMask1;
14946 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
14947
14948 APInt ToMask2, FromMask2;
14949 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
14950 assert(From1 == From2);
14951 (void)From2;
14952
14953 // Create a new BFI, combining the two together.
14954 APInt NewFromMask = FromMask1 | FromMask2;
14955 APInt NewToMask = ToMask1 | ToMask2;
14956
14957 EVT VT = N->getValueType(0);
14958 SDLoc dl(N);
14959
      // If the combined source field does not start at bit 0, shift the
      // source right so that it does.
14960 if (NewFromMask[0] == 0)
14961 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
14962 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
14963 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14964 DAG.getConstant(~NewToMask, dl, VT));
14965 }
14966
14967 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14968 // that lower bit insertions are performed first, providing that M1 and M2
14969 // do not overlap. This can allow multiple BFI instructions to be combined
14970 // together by the other folds above.
14971 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14972 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14973 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14974
      // Bail out if the inner BFI has other users, the written fields overlap,
      // or the outer field has fewer leading zeros than the inner one.
14975 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14976 ToMask1.countl_zero() < ToMask2.countl_zero())
14977 return SDValue();
14978
14979 EVT VT = N->getValueType(0);
14980 SDLoc dl(N);
14981 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
14982 N->getOperand(1), N->getOperand(2));
14983 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
14984 N0.getOperand(2));
14985 }
14986
14987 return SDValue();
14988}
14989
14990// Check that N is CMPZ(CSINC(0, 0, CC, X)),
14991// or CMPZ(CMOV(1, 0, CC, $cpsr, X))
14992// return X if valid.
//
// A CMOV with the constants the other way round (0, 1) is accepted as well.
// Single-use `and x, 1` nodes between the CMPZ and the CSINC/CMOV are skipped.
// In every accepted form the matched node must have exactly one use. Returns an
// empty SDValue when the pattern does not match.
// NOTE(review): the signature and the statement(s) between each `if` and its
// `return` are not visible in this listing. Callers in this chunk pass a second
// argument and read it afterwards, so the missing statements presumably store
// the matched condition code into that out-parameter — confirm.
14994 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
14995 return SDValue();
14996 SDValue CSInc = Cmp->getOperand(0);
14997
14998 // Ignore any `And 1` nodes that may not yet have been removed. We are
14999 // looking for a value that produces 1/0, so these have no effect on the
15000 // code.
15001 while (CSInc.getOpcode() == ISD::AND &&
15002 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
15003 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
15004 CSInc = CSInc.getOperand(0);
15005
      // CSINC 0, 0, CC, X
15006 if (CSInc.getOpcode() == ARMISD::CSINC &&
15007 isNullConstant(CSInc.getOperand(0)) &&
15008 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15010 return CSInc.getOperand(3);
15011 }
      // CMOV 1, 0, ...
15012 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
15013 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15015 return CSInc.getOperand(4);
15016 }
      // CMOV 0, 1, ... (constants swapped)
15017 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
15018 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
15021 return CSInc.getOperand(4);
15022 }
15023 return SDValue();
15024}
15025
// Combine for a CMPZ node N: when N matches the pattern recognised by
// IsCMPZCSINC and the condition reported through Cond is ARMCC::EQ, the value
// returned by IsCMPZCSINC is used in place of N. Otherwise returns an empty
// SDValue.
// NOTE(review): the signature (including the function name) and the
// declaration of Cond are not visible in this listing.
15027 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
15028 // t92: glue = ARMISD::CMPZ t74, 0
15029 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
15030 // t96: glue = ARMISD::CMPZ t93, 0
15031 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
15033 if (SDValue C = IsCMPZCSINC(N, Cond))
15034 if (Cond == ARMCC::EQ)
15035 return C;
15036 return SDValue();
15037}
15038
// Combine for a conditional-select style node N whose operand 2 is a condition
// code and whose operand 3 is the compare feeding it. When operand 3 matches
// IsCMPZCSINC, N is rebuilt with the same opcode and first two operands but
// using the inner compare directly. Returns an empty SDValue otherwise.
// NOTE(review): the signature (including the function name), the declaration
// of Cond and the final arguments of the NE-case getNode call are not visible
// in this listing.
15040 // Fold away an unnecessary CMPZ/CSINC
15041 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
15042 // if C1==EQ -> CSXYZ A, B, C2, D
15043 // if C1==NE -> CSXYZ A, B, NOT(C2), D
15045 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
15046 if (N->getConstantOperandVal(2) == ARMCC::EQ)
15047 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15048 N->getOperand(1),
15049 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
15050 if (N->getConstantOperandVal(2) == ARMCC::NE)
15051 return DAG.getNode(
15052 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15053 N->getOperand(1),
15055 }
15056 return SDValue();
15057}
15058
15059/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
15060/// ARMISD::VMOVRRD.
///
/// Three folds are attempted on the node's single input:
///  1. vmovrrd(vmovdrr x, y) -> x, y (only when the subtarget has FP64).
///  2. vmovrrd of a single-use, non-volatile, normal f64 load from a frame
///     index -> two i32 loads at offsets 0 and 4.
///  3. vmovrrd of a constant-index extract from a v4i32 build_vector or
///     insert_vector_elt chain (seen through bitcasts) -> the two lanes.
/// Returns the replacement, or an empty SDValue when nothing applies.
// NOTE(review): the opening of the signature and one line of the cast-skipping
// loop condition are not visible in this listing.
15063 const ARMSubtarget *Subtarget) {
15064 // vmovrrd(vmovdrr x, y) -> x,y
15065 SDValue InDouble = N->getOperand(0);
15066 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
15067 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
15068
15069 // vmovrrd(load f64) -> (load i32), (load i32)
15070 SDNode *InNode = InDouble.getNode();
15071 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
15072 InNode->getValueType(0) == MVT::f64 &&
15073 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
15074 !cast<LoadSDNode>(InNode)->isVolatile()) {
15075 // TODO: Should this be done for non-FrameIndex operands?
15076 LoadSDNode *LD = cast<LoadSDNode>(InNode);
15077
15078 SelectionDAG &DAG = DCI.DAG;
15079 SDLoc DL(LD);
15080 SDValue BasePtr = LD->getBasePtr();
      // First word: same address, alignment and flags as the original load.
15081 SDValue NewLD1 =
15082 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
15083 LD->getAlign(), LD->getMemOperand()->getFlags());
15084
15085 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
15086 DAG.getConstant(4, DL, MVT::i32));
15087
      // Second word: four bytes further on, with the alignment adjusted for
      // that offset.
15088 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
15089 LD->getPointerInfo().getWithOffset(4),
15090 commonAlignment(LD->getAlign(), 4),
15091 LD->getMemOperand()->getFlags());
15092
      // Users of the old load's chain result now depend on NewLD2's chain.
15093 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
      // On big-endian data layouts the two words are returned in swapped order.
15094 if (DCI.DAG.getDataLayout().isBigEndian())
15095 std::swap (NewLD1, NewLD2);
15096 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
15097 return Result;
15098 }
15099
15100 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
15101 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
15102 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15103 isa<ConstantSDNode>(InDouble.getOperand(1))) {
15104 SDValue BV = InDouble.getOperand(0);
15105 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
15106 // change lane order under big endian.
      // BVSwap ends up recording whether the last cast skipped was a BITCAST.
15107 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
15108 while (
15109 (BV.getOpcode() == ISD::BITCAST ||
15111 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
15112 BVSwap = BV.getOpcode() == ISD::BITCAST;
15113 BV = BV.getOperand(0);
15114 }
15115 if (BV.getValueType() != MVT::v4i32)
15116 return SDValue();
15117
15118 // Handle buildvectors, pulling out the correct lane depending on
15119 // endianness.
      // Extract index 1 maps to v4i32 lanes 2 and 3; any other index maps to
      // lanes 0 and 1.
15120 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
15121 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
15122 SDValue Op0 = BV.getOperand(Offset);
15123 SDValue Op1 = BV.getOperand(Offset + 1);
15124 if (!Subtarget->isLittle() && BVSwap)
15125 std::swap(Op0, Op1);
15126
15127 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15128 }
15129
15130 // A chain of insert_vectors, grabbing the correct value of the chain of
15131 // inserts.
      // Walking from the outermost insert inwards, the first insert seen for
      // each lane wins (later ones are ignored via the !Op0 / !Op1 checks).
15132 SDValue Op0, Op1;
15133 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15134 if (isa<ConstantSDNode>(BV.getOperand(2))) {
15135 if (BV.getConstantOperandVal(2) == Offset && !Op0)
15136 Op0 = BV.getOperand(1);
15137 if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
15138 Op1 = BV.getOperand(1);
15139 }
15140 BV = BV.getOperand(0);
15141 }
15142 if (!Subtarget->isLittle() && BVSwap)
15143 std::swap(Op0, Op1);
      // Only fold when both lanes were found in the insert chain.
15144 if (Op0 && Op1)
15145 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15146 }
15147
15148 return SDValue();
15149}
15150
15151/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
15152/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
///
/// If the two operands (looking through at most one BITCAST each) are results
/// 0 and 1 of the same ARMISD::VMOVRRD node, returns a BITCAST of that node's
/// input to N's result type. Otherwise returns an empty SDValue.
// NOTE(review): the signature line is not visible in this listing.
15154 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
15155 SDValue Op0 = N->getOperand(0);
15156 SDValue Op1 = N->getOperand(1);
15157 if (Op0.getOpcode() == ISD::BITCAST)
15158 Op0 = Op0.getOperand(0);
15159 if (Op1.getOpcode() == ISD::BITCAST)
15160 Op1 = Op1.getOperand(0);
15161 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
15162 Op0.getNode() == Op1.getNode() &&
15163 Op0.getResNo() == 0 && Op1.getResNo() == 1)
15164 return DAG.getNode(ISD::BITCAST, SDLoc(N),
15165 N->getValueType(0), Op0.getOperand(0));
15166 return SDValue();
15167}
15168
// DAG combine for a node N that the comments below call VMOVhr. Four folds are
// attempted on its operand 0:
//  1. VMOVhr(VMOVrh X) -> X.
//  2. VMOVhr(bitcast(f32 CopyFromReg)) -> a CopyFromReg that produces N's
//     result type directly.
//  3. VMOVhr(single-use, unindexed i16 load) -> a load of N's result type.
//  4. Otherwise, simplify operand 0 given that only its low 16 bits are used.
// Returns the replacement, SDValue(N, 0) when operand 0 was simplified in
// place, or an empty SDValue when nothing applies.
// NOTE(review): the signature (including the function name) and the line that
// begins the NewCopy initialiser are not visible in this listing.
15171 SDValue Op0 = N->getOperand(0);
15172
15173 // VMOVhr (VMOVrh (X)) -> X
15174 if (Op0->getOpcode() == ARMISD::VMOVrh)
15175 return Op0->getOperand(0);
15176
15177 // FullFP16: half values are passed in S-registers, and we don't
15178 // need any of the bitcast and moves:
15179 //
15180 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
15181 // t5: i32 = bitcast t2
15182 // t18: f16 = ARMISD::VMOVhr t5
15183 // =>
15184 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
15185 if (Op0->getOpcode() == ISD::BITCAST) {
15186 SDValue Copy = Op0->getOperand(0);
15187 if (Copy.getValueType() == MVT::f32 &&
15188 Copy->getOpcode() == ISD::CopyFromReg) {
      // A third operand on the CopyFromReg is treated as incoming glue; the
      // operand and result lists below are trimmed to 2 or 3 entries to match.
15189 bool HasGlue = Copy->getNumOperands() == 3;
15190 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
15191 HasGlue ? Copy->getOperand(2) : SDValue()};
15192 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
15193 SDValue NewCopy =
15195 DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
15196 ArrayRef(Ops, HasGlue ? 3 : 2));
15197
15198 // Update Users, Chains, and Potential Glue.
15199 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
15200 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
15201 if (HasGlue)
15202 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
15203 NewCopy.getValue(2));
15204
15205 return NewCopy;
15206 }
15207 }
15208
15209 // fold (VMOVhr (load x)) -> (load (f16*)x)
15210 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
15211 if (LN0->hasOneUse() && LN0->isUnindexed() &&
15212 LN0->getMemoryVT() == MVT::i16) {
15213 SDValue Load =
15214 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15215 LN0->getBasePtr(), LN0->getMemOperand());
      // Redirect users of both the value and the old load's chain result.
15216 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15217 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15218 return Load;
15219 }
15220 }
15221
15222 // Only the bottom 16 bits of the source register are used.
15223 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15224 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15225 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15226 return SDValue(N, 0);
15227
15228 return SDValue();
15229}
15230
// DAG combine for a node N that the comments below call VMOVrh. Three folds
// are attempted on its operand 0:
//  1. A floating-point constant -> the integer constant with the same bits.
//  2. A single-use normal load -> a zero-extending i16 load.
//  3. An EXTRACT_VECTOR_ELT -> ARMISD::VGETLANEu on the same vector and index.
// Returns the replacement, or an empty SDValue when nothing applies.
// NOTE(review): the signature (including the function name), the `if` that
// introduces the constant `C`, and the second half of the extract condition
// are not visible in this listing.
15232 SDValue N0 = N->getOperand(0);
15233 EVT VT = N->getValueType(0);
15234
15235 // fold (VMOVrh (fpconst x)) -> const x
15237 APFloat V = C->getValueAPF();
15238 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15239 }
15240
15241 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15242 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15243 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15244
15245 SDValue Load =
15246 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15247 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
      // Redirect users of both the value and the old load's chain result.
15248 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15249 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15250 return Load;
15251 }
15252
15253 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
15254 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15256 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15257 N0->getOperand(1));
15258
15259 return SDValue();
15260}
15261
15262/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15263/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15264/// i64 vector to have f64 elements, since the value can then be loaded
15265/// directly into a VFP register.
///
/// Scans operands 0 .. NumElts-1 of N, where NumElts is the element count of
/// N's result type, and returns true at the first match.
// NOTE(review): the signature line is not visible in this listing.
15267 unsigned NumElts = N->getValueType(0).getVectorNumElements();
15268 for (unsigned i = 0; i < NumElts; ++i) {
15269 SDNode *Elt = N->getOperand(i).getNode();
15270 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15271 return true;
15272 }
15273 return false;
15274}
15275
15276/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15277/// ISD::BUILD_VECTOR.
///
/// Two-operand build vectors are first offered to PerformVMOVDRRCombine. After
/// that, a build vector with i64 elements and at least one normal non-volatile
/// load operand is rebuilt with every operand bitcast to f64, and the result
/// is bitcast back to the original type. Returns an empty SDValue otherwise.
// NOTE(review): the opening of the signature and the declaration of `Ops` are
// not visible in this listing.
15280 const ARMSubtarget *Subtarget) {
15281 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15282 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15283 // into a pair of GPRs, which is fine when the value is used as a scalar,
15284 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15285 SelectionDAG &DAG = DCI.DAG;
15286 if (N->getNumOperands() == 2)
15287 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15288 return RV;
15289
15290 // Load i64 elements as f64 values so that type legalization does not split
15291 // them up into i32 values.
15292 EVT VT = N->getValueType(0);
15293 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15294 return SDValue();
15295 SDLoc dl(N);
15297 unsigned NumElts = VT.getVectorNumElements();
15298 for (unsigned i = 0; i < NumElts; ++i) {
15299 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15300 Ops.push_back(V);
15301 // Make the DAGCombiner fold the bitcast.
15302 DCI.AddToWorklist(V.getNode());
15303 }
      // Build the f64 vector with the same element count, then cast it back.
15304 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15305 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15306 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15307}
15308
15309/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
// Rewrites a 32-bit-element ARMISD::BUILD_VECTOR whose single user is a
// bitcast to a non-floating-point type into a chain of i32 INSERT_VECTOR_ELT
// nodes, followed by a bitcast back to the original vector type. Returns an
// empty SDValue when any precondition or the profitability check fails.
// NOTE(review): the line carrying the function name and parameters (listing
// line 15311) is missing from this listing; the body uses `N` and `DCI`.
15310static SDValue
15312 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15313 // At that time, we may have inserted bitcasts from integer to float.
15314 // If these bitcasts have survived DAGCombine, change the lowering of this
15315 // BUILD_VECTOR into something more vector friendly, i.e., that does not
15316 // force to use floating point types.
15317
15318 // Make sure we can change the type of the vector.
15319 // This is possible iff:
15320 // 1. The vector is only used in a bitcast to an integer type. I.e.,
15321 // 1.1. Vector is used only once.
15322 // 1.2. Use is a bit convert to an integer type.
15323 // 2. The size of its operands are 32-bits (64-bits are not legal).
15324 EVT VT = N->getValueType(0);
15325 EVT EltVT = VT.getVectorElementType();
15326
15327 // Check 1.1. and 2.
15328 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15329 return SDValue();
15330
15331 // By construction, the input type must be float.
15332 assert(EltVT == MVT::f32 && "Unexpected type!");
15333
15334 // Check 1.2.
15335 SDNode *Use = *N->use_begin();
15336 if (Use->getOpcode() != ISD::BITCAST ||
15337 Use->getValueType(0).isFloatingPoint())
15338 return SDValue();
15339
15340 // Check profitability.
15341 // Model is, if more than half of the relevant operands are bitcast from
15342 // i32, turn the build_vector into a sequence of insert_vector_elt.
15343 // Relevant operands are everything that is not statically
15344 // (i.e., at compile time) bitcasted.
15345 unsigned NumOfBitCastedElts = 0;
15346 unsigned NumElts = VT.getVectorNumElements();
15347 unsigned NumOfRelevantElts = NumElts;
15348 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15349 SDValue Elt = N->getOperand(Idx);
15350 if (Elt->getOpcode() == ISD::BITCAST) {
15351 // Assume only bit cast to i32 will go away.
15352 if (Elt->getOperand(0).getValueType() == MVT::i32)
15353 ++NumOfBitCastedElts;
15354 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15355 // Constants are statically casted, thus do not count them as
15356 // relevant operands.
15357 --NumOfRelevantElts;
15358 }
15359
15360 // Check if more than half of the elements require a non-free bitcast.
// The division is integer division, and a tie (exactly half) is rejected.
15361 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15362 return SDValue();
15363
15364 SelectionDAG &DAG = DCI.DAG;
15365 // Create the new vector type.
15366 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15367 // Check if the type is legal.
15368 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15369 if (!TLI.isTypeLegal(VecVT))
15370 return SDValue();
15371
15372 // Combine:
15373 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15374 // => BITCAST INSERT_VECTOR_ELT
15375 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15376 // (BITCAST EN), N.
15377 SDValue Vec = DAG.getUNDEF(VecVT);
15378 SDLoc dl(N);
15379 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15380 SDValue V = N->getOperand(Idx);
// Undef operands are skipped, so their lanes keep the undef value that Vec
// started with.
15381 if (V.isUndef())
15382 continue;
15383 if (V.getOpcode() == ISD::BITCAST &&
15384 V->getOperand(0).getValueType() == MVT::i32)
15385 // Fold obvious case.
15386 V = V.getOperand(0);
15387 else {
15388 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15389 // Make the DAGCombiner fold the bitcasts.
15390 DCI.AddToWorklist(V.getNode());
15391 }
15392 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15393 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15394 }
15395 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15396 // Make the DAGCombiner fold the bitcasts.
15397 DCI.AddToWorklist(Vec.getNode());
15398 return Vec;
15399}
15400
// DAG combine for a node whose inline comments call it PREDICATE_CAST. It
// (1) collapses a cast of a cast, (2) moves a bitwise NOT of an i32 source to
// the result side of the cast, and (3) for an i32 source, asks
// SimplifyDemandedBits to simplify it given that only the low 16 bits are
// demanded. Returns an empty SDValue when nothing changes.
// NOTE(review): the line carrying the function name and parameters (listing
// line 15402) and the start of the declaration of `C` (listing line 15420) are
// missing from this listing.
15401static SDValue
15403 EVT VT = N->getValueType(0);
15404 SDValue Op = N->getOperand(0);
15405 SDLoc dl(N);
15406
15407 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15408 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15409 // If the valuetypes are the same, we can remove the cast entirely.
15410 if (Op->getOperand(0).getValueType() == VT)
15411 return Op->getOperand(0);
15412 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15413 }
15414
15415 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15416 // more VPNOT which might get folded as else predicates.
15417 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15418 SDValue X =
15419 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
// 65535 is 0xFFFF, i.e. all of the low 16 bits set.
15421 DCI.DAG.getConstant(65535, dl, MVT::i32));
15422 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15423 }
15424
15425 // Only the bottom 16 bits of the source register are used.
15426 if (Op.getValueType() == MVT::i32) {
15427 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15428 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
// On success N itself is returned rather than a new node.
15429 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15430 return SDValue(N, 0);
15431 }
15432 return SDValue();
15433}
15434
// DAG combine for a node whose inline comments call it VECTOR_REG_CAST. When
// ST->isLittle() is true the node is always replaced by an ISD::BITCAST, so
// the remaining folds (same-type cast, undef source, cast-of-cast) are only
// reached otherwise.
// NOTE(review): the first signature line (listing line 15435) is missing from
// this listing; the body uses `N`, `DAG` and `ST`.
15436 const ARMSubtarget *ST) {
15437 EVT VT = N->getValueType(0);
15438 SDValue Op = N->getOperand(0);
15439 SDLoc dl(N);
15440
15441 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
15442 if (ST->isLittle())
15443 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
15444
15445 // VT VECTOR_REG_CAST (VT Op) -> Op
15446 if (Op.getValueType() == VT)
15447 return Op;
15448 // VECTOR_REG_CAST undef -> undef
15449 if (Op.isUndef())
15450 return DAG.getUNDEF(VT);
15451
15452 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15453 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15454 // If the valuetypes are the same, we can remove the cast entirely.
15455 if (Op->getOperand(0).getValueType() == VT)
15456 return Op->getOperand(0);
15457 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
15458 }
15459
15460 return SDValue();
15461}
15462
// DAG combine for a three-operand vector compare (the inline comments call it
// vcmp): operands 0 and 1 are the compared values and operand 2 is the
// condition. The combine canonicalises comparisons against a zero vector to
// ARMISD::VCMPZ and moves a VDUP operand to the right-hand side, swapping the
// condition where needed. It does nothing unless the subtarget reports MVE
// integer operations.
// NOTE(review): the first signature line (listing line 15463) and the
// declaration of `Cond` (listing line 15471) are missing from this listing;
// `Cond` is presumably derived from operand 2 -- confirm against the full file.
15464 const ARMSubtarget *Subtarget) {
15465 if (!Subtarget->hasMVEIntegerOps())
15466 return SDValue();
15467
15468 EVT VT = N->getValueType(0);
15469 SDValue Op0 = N->getOperand(0);
15470 SDValue Op1 = N->getOperand(1);
15472 SDLoc dl(N);
15473
15474 // vcmp X, 0, cc -> vcmpz X, cc
15475 if (isZeroVector(Op1))
15476 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15477
// The operand-swapping folds below are only attempted when the swapped
// condition passes isValidMVECond for this result type.
15478 unsigned SwappedCond = getSwappedCondition(Cond);
15479 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15480 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15481 if (isZeroVector(Op0))
15482 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15483 DAG.getConstant(SwappedCond, dl, MVT::i32));
15484 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
15485 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15486 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15487 DAG.getConstant(SwappedCond, dl, MVT::i32));
15488 }
15489
15490 return SDValue();
15491}
15492
15493/// PerformInsertEltCombine - Target-specific dag combine xforms for
15494/// ISD::INSERT_VECTOR_ELT.
// Applies only when the vector has i64 elements and the inserted element
// (operand 1) is a normal, non-volatile load. The insert is then redone on an
// f64 vector of the same shape and the result is bitcast back to VT.
// NOTE(review): the signature lines (listing lines 15495-15496) and the
// continuation of the FloatVT expression (listing line 15508, which would hold
// the element-count argument) are missing from this listing.
15497 // Bitcast an i64 load inserted into a vector to f64.
15498 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15499 EVT VT = N->getValueType(0);
15500 SDNode *Elt = N->getOperand(1).getNode();
15501 if (VT.getVectorElementType() != MVT::i64 ||
15502 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15503 return SDValue();
15504
15505 SelectionDAG &DAG = DCI.DAG;
15506 SDLoc dl(N);
15507 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
15509 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15510 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15511 // Make the DAGCombiner fold the bitcasts.
15512 DCI.AddToWorklist(Vec.getNode());
15513 DCI.AddToWorklist(V.getNode());
// The lane index (operand 2) is reused unchanged.
15514 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15515 Vec, V, N->getOperand(2));
15516 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15517}
15518
15519// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15520// directly or bitcast to an integer if the original is a float vector.
15521// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15522// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
// Result 0 of the new VMOVRRD is returned as the replacement for N; result 1
// replaces the matching "lane n+1" node through DCI.CombineTo. An empty
// SDValue is returned whenever a precondition fails.
// NOTE(review): the line carrying the function name and parameters (listing
// line 15524) is missing from this listing; the visible call site passes
// (N, DCI).
15523static SDValue
15525 EVT VT = N->getValueType(0);
15526 SDLoc dl(N);
15527
// Only runs after DAG legalization, for an i32 result, and when f64 is legal.
15528 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15529 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15530 return SDValue();
15531
// N may be either the extract itself or an i32 bitcast of an f32 extract; in
// the latter case look through the bitcast.
15532 SDValue Ext = SDValue(N, 0);
15533 if (Ext.getOpcode() == ISD::BITCAST &&
15534 Ext.getOperand(0).getValueType() == MVT::f32)
15535 Ext = Ext.getOperand(0);
// The lane index must be a constant and even.
15536 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15537 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
15538 Ext.getConstantOperandVal(1) % 2 != 0)
15539 return SDValue();
// Give up when the extract's only user is an integer-to-FP conversion.
15540 if (Ext->use_size() == 1 &&
15541 (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP ||
15542 Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP))
15543 return SDValue();
15544
15545 SDValue Op0 = Ext.getOperand(0);
15546 EVT VecVT = Op0.getValueType();
15547 unsigned ResNo = Op0.getResNo();
15548 unsigned Lane = Ext.getConstantOperandVal(1);
// Only four-element source vectors are handled.
15549 if (VecVT.getVectorNumElements() != 4)
15550 return SDValue();
15551
15552 // Find another extract, of Lane + 1
15553 auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) {
15554 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15555 isa<ConstantSDNode>(V->getOperand(1)) &&
15556 V->getConstantOperandVal(1) == Lane + 1 &&
15557 V->getOperand(0).getResNo() == ResNo;
15558 });
15559 if (OtherIt == Op0->uses().end())
15560 return SDValue();
15561
15562 // For float extracts, we need to be converting to a i32 for both vector
15563 // lanes.
15564 SDValue OtherExt(*OtherIt, 0);
15565 if (OtherExt.getValueType() != MVT::i32) {
15566 if (OtherExt->use_size() != 1 ||
15567 OtherExt->use_begin()->getOpcode() != ISD::BITCAST ||
15568 OtherExt->use_begin()->getValueType(0) != MVT::i32)
15569 return SDValue();
// From here on OtherExt names the i32 bitcast, which is the node replaced below.
15570 OtherExt = SDValue(*OtherExt->use_begin(), 0);
15571 }
15572
15573 // Convert the type to a f64 and extract with a VMOVRRD.
15574 SDValue F64 = DCI.DAG.getNode(
15575 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15576 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15577 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15578 SDValue VMOVRRD =
15579 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15580
15581 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15582 return VMOVRRD;
15583}
15584
// DAG combine for an element extract from a vector Op0 (operand 0) at the lane
// given by operand 1. Folds, in order: extract of a VDUP, extract of an
// ARMISD::BUILD_VECTOR, extract through a bitcast of a v2f64 vector whose
// operands are VMOVDRR nodes, pairing with a neighbouring extract into
// VMOVRRD, and extract of an MVETRUNC.
// NOTE(review): the first signature lines (listing lines 15585-15586), one
// clause of the condition of the VMOVDRR fold (listing line 15619) and the
// right-hand sides of `Vec` and `SubIdx` (listing lines 15636 and 15638) are
// missing from this listing.
15587 const ARMSubtarget *ST) {
15588 SDValue Op0 = N->getOperand(0);
15589 EVT VT = N->getValueType(0);
15590 SDLoc dl(N);
15591
15592 // extract (vdup x) -> x
15593 if (Op0->getOpcode() == ARMISD::VDUP) {
15594 SDValue X = Op0->getOperand(0);
// When the scalar and result types differ between f16/f32 and i32, a
// VMOVhr, VMOVrh or BITCAST node is inserted to convert.
15595 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15596 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15597 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15598 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15599 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15600 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15601
// Otherwise peel bitcasts off X until a value of type VT is found, if any.
15602 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15603 X = X->getOperand(0);
15604 if (X.getValueType() == VT)
15605 return X;
15606 }
15607
15608 // extract ARM_BUILD_VECTOR -> x
15609 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15610 isa<ConstantSDNode>(N->getOperand(1)) &&
15611 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15612 return Op0.getOperand(N->getConstantOperandVal(1));
15613 }
15614
15615 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15616 if (Op0.getValueType() == MVT::v4i32 &&
15617 isa<ConstantSDNode>(N->getOperand(1)) &&
15618 Op0.getOpcode() == ISD::BITCAST &&
15620 Op0.getOperand(0).getValueType() == MVT::v2f64) {
15621 SDValue BV = Op0.getOperand(0);
15622 unsigned Offset = N->getConstantOperandVal(1);
// i32 lanes 0-1 map to f64 operand 0 and lanes 2-3 to f64 operand 1; which
// half of the VMOVDRR is selected depends on ST->isLittle().
15623 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
15624 if (MOV.getOpcode() == ARMISD::VMOVDRR)
15625 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15626 }
15627
15628 // extract x, n; extract x, n+1 -> VMOVRRD x
15629 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15630 return R;
15631
15632 // extract (MVETrunc(x)) -> extract x
15633 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
// NOTE(review): unlike the folds above, no isa<ConstantSDNode> check on
// operand 1 is visible before getConstantOperandVal(1) here -- confirm that
// the lane index is guaranteed to be constant on this path.
15634 unsigned Idx = N->getConstantOperandVal(1);
15635 unsigned Vec =
15637 unsigned SubIdx =
15639 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15640 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15641 }
15642
15643 return SDValue();
15644}
15645
// DAG combine that turns a sign-extend-in-register of an unsigned lane read
// (ARMISD::VGETLANEu) into a signed lane read (ARMISD::VGETLANEs). It applies
// only when the type carried by operand 1 of N equals the scalar type of the
// vector being read. Returns an empty SDValue otherwise.
// NOTE(review): the signature line (listing line 15646) is missing from this
// listing; the body uses `N` and `DAG`.
15647 SDValue Op = N->getOperand(0);
15648 EVT VT = N->getValueType(0);
15649
15650 // sext_inreg(VGETLANEu) -> VGETLANEs
15651 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15652 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15653 Op.getOperand(0).getValueType().getScalarType())
15654 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15655 Op.getOperand(1));
15656
15657 return SDValue();
15658}
15659
// DAG combine that rewrites the insertion of a half-width subvector (operand
// 1) into a vector (operand 0) at a constant index (operand 2) as a
// CONCAT_VECTORS of the subvector and the untouched half of the original
// vector. Returns an empty SDValue when the pattern does not match.
// NOTE(review): the line carrying the function name and parameters (listing
// line 15661) and the last clause of the legality check (listing line 15671)
// are missing from this listing.
15660static SDValue
15662 SDValue Vec = N->getOperand(0);
15663 SDValue SubVec = N->getOperand(1);
15664 uint64_t IdxVal = N->getConstantOperandVal(2);
15665 EVT VecVT = Vec.getValueType();
15666 EVT SubVT = SubVec.getValueType();
15667
15668 // Only do this for legal fixed vector types.
15669 if (!VecVT.isFixedLengthVector() ||
15670 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15672 return SDValue();
15673
15674 // Ignore widening patterns.
15675 if (IdxVal == 0 && Vec.isUndef())
15676 return SDValue();
15677
15678 // Subvector must be half the width and an "aligned" insertion.
// "Aligned" here means the index is either 0 or the subvector element count.
15679 unsigned NumSubElts = SubVT.getVectorNumElements();
15680 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15681 (IdxVal != 0 && IdxVal != NumSubElts))
15682 return SDValue();
15683
15684 // Fold insert_subvector -> concat_vectors
15685 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15686 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15687 SDLoc DL(N);
15688 SDValue Lo, Hi;
15689 if (IdxVal == 0) {
15690 Lo = SubVec;
15691 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15692 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15693 } else {
15694 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15695 DCI.DAG.getVectorIdxConstant(0, DL));
15696 Hi = SubVec;
15697 }
15698 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15699}
15700
15701// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
// Applies only when operand 0 of the shuffle is an ARMISD::MVETRUNC and
// operand 1 is undef. The shuffle mask is tested with isVMOVNTruncMask in both
// orientations; the second orientation emits the two MVETRUNC operands in
// swapped order. Both VMOVN forms take the constant 1 as their last operand.
// NOTE(review): the first signature line (listing line 15702) is missing from
// this listing; the body calls N->getMask(), so N is presumably a shuffle
// node type rather than a plain SDNode -- confirm against the full file.
15703 SelectionDAG &DAG) {
15704 SDValue Trunc = N->getOperand(0);
15705 EVT VT = Trunc.getValueType();
15706 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15707 return SDValue();
15708
15709 SDLoc DL(Trunc);
15710 if (isVMOVNTruncMask(N->getMask(), VT, false))
15711 return DAG.getNode(
15712 ARMISD::VMOVN, DL, VT,
15713 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15714 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15715 DAG.getConstant(1, DL, MVT::i32));
15716 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15717 return DAG.getNode(
15718 ARMISD::VMOVN, DL, VT,
15719 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15720 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15721 DAG.getConstant(1, DL, MVT::i32));
15722 return SDValue();
15723}
15724
15725/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15726/// ISD::VECTOR_SHUFFLE.
// NOTE(review): the signature line and the statement that produces `R`
// (listing lines 15727-15728), as well as the declaration of `SVN` (listing
// line 15766), are missing from this listing. The first visible statement
// returns `R`, presumably the result of an earlier combine attempt.
15729 return R;
15730
15731 // The LLVM shufflevector instruction does not require the shuffle mask
15732 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15733 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15734 // operands do not match the mask length, they are extended by concatenating
15735 // them with undef vectors. That is probably the right thing for other
15736 // targets, but for NEON it is better to concatenate two double-register
15737 // size vector operands into a single quad-register size vector. Do that
15738 // transformation here:
15739 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15740 // shuffle(concat(v1, v2), undef)
15741 SDValue Op0 = N->getOperand(0);
15742 SDValue Op1 = N->getOperand(1);
15743 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15744 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15745 Op0.getNumOperands() != 2 ||
15746 Op1.getNumOperands() != 2)
15747 return SDValue();
15748 SDValue Concat0Op1 = Op0.getOperand(1);
15749 SDValue Concat1Op1 = Op1.getOperand(1);
15750 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15751 return SDValue();
15752 // Skip the transformation if any of the types are illegal.
15753 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15754 EVT VT = N->getValueType(0);
15755 if (!TLI.isTypeLegal(VT) ||
15756 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15757 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15758 return SDValue();
15759
15760 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15761 Op0.getOperand(0), Op1.getOperand(0));
15762 // Translate the shuffle mask.
// Indices below HalfElts (including negative, i.e. undef, entries) are kept
// as they are. Indices in [NumElts, NumElts + HalfElts) referred to v2 and are
// moved to the upper half of the new concat. Every other index pointed at one
// of the undef halves and becomes -1.
15763 SmallVector<int, 16> NewMask;
15764 unsigned NumElts = VT.getVectorNumElements();
15765 unsigned HalfElts = NumElts/2;
15767 for (unsigned n = 0; n < NumElts; ++n) {
15768 int MaskElt = SVN->getMaskElt(n);
15769 int NewElt = -1;
15770 if (MaskElt < (int)HalfElts)
15771 NewElt = MaskElt;
15772 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15773 NewElt = HalfElts + MaskElt - NumElts;
15774 NewMask.push_back(NewElt);
15775 }
15776 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15777 DAG.getUNDEF(VT), NewMask);
15778}
15779
15780/// Load/store instruction that can be merged with a base address
15781/// update
// NOTE(review): the struct header and all members except the last one
// (listing lines 15782-15785) are missing from this listing. Code elsewhere in
// this file initialises the struct as {N, isIntrinsic, isStore, AddrOpIdx}.
// AddrOpIdx is used as the operand index of the address on the node N.
15786 unsigned AddrOpIdx;
15787};
15788
// NOTE(review): the struct header (listing line 15789) and the declarations
// of the first two members (listing lines 15791 and 15793) are missing from
// this listing; only their doc comments survive. Code elsewhere in this file
// accesses the members as N, Inc and ConstInc.
15790 /// Instruction that updates a pointer
15792 /// Pointer increment operand
15794 /// Pointer increment value if it is a constant, or 0 otherwise
15795 unsigned ConstInc;
15796};
15797
// Tries to merge the pointer update described by User into the load/store
// described by Target by creating a single updating (_UPD) node.
// On success it replaces both the memory node and the pointer-update node via
// DCI.CombineTo and returns true; it returns false, leaving the DAG untouched,
// when the increment is not acceptable for the access size.
// When SimpleConstIncOnly is set, only a constant increment equal to the
// number of bytes accessed is accepted.
// NOTE(review): the first and last signature lines (listing lines 15798 and
// 15801) and the declaration of `Ops` (listing line 16049) are missing from
// this listing.
15799 struct BaseUpdateUser &User,
15800 bool SimpleConstIncOnly,
15802 SelectionDAG &DAG = DCI.DAG;
15803 SDNode *N = Target.N;
15804 MemSDNode *MemN = cast<MemSDNode>(N);
15805 SDLoc dl(N);
15806
15807 // Find the new opcode for the updating load/store.
15808 bool isLoadOp = true;
15809 bool isLaneOp = false;
15810 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15811 // as an operand.
15812 bool hasAlignment = true;
15813 unsigned NewOpc = 0;
15814 unsigned NumVecs = 0;
15815 if (Target.isIntrinsic) {
// For intrinsic nodes, operand 1 holds the intrinsic ID.
15816 unsigned IntNo = N->getConstantOperandVal(1);
15817 switch (IntNo) {
15818 default:
15819 llvm_unreachable("unexpected intrinsic for Neon base update");
15820 case Intrinsic::arm_neon_vld1:
15821 NewOpc = ARMISD::VLD1_UPD;
15822 NumVecs = 1;
15823 break;
15824 case Intrinsic::arm_neon_vld2:
15825 NewOpc = ARMISD::VLD2_UPD;
15826 NumVecs = 2;
15827 break;
15828 case Intrinsic::arm_neon_vld3:
15829 NewOpc = ARMISD::VLD3_UPD;
15830 NumVecs = 3;
15831 break;
15832 case Intrinsic::arm_neon_vld4:
15833 NewOpc = ARMISD::VLD4_UPD;
15834 NumVecs = 4;
15835 break;
15836 case Intrinsic::arm_neon_vld1x2:
15837 NewOpc = ARMISD::VLD1x2_UPD;
15838 NumVecs = 2;
15839 hasAlignment = false;
15840 break;
15841 case Intrinsic::arm_neon_vld1x3:
15842 NewOpc = ARMISD::VLD1x3_UPD;
15843 NumVecs = 3;
15844 hasAlignment = false;
15845 break;
15846 case Intrinsic::arm_neon_vld1x4:
15847 NewOpc = ARMISD::VLD1x4_UPD;
15848 NumVecs = 4;
15849 hasAlignment = false;
15850 break;
15851 case Intrinsic::arm_neon_vld2dup:
15852 NewOpc = ARMISD::VLD2DUP_UPD;
15853 NumVecs = 2;
15854 break;
15855 case Intrinsic::arm_neon_vld3dup:
15856 NewOpc = ARMISD::VLD3DUP_UPD;
15857 NumVecs = 3;
15858 break;
15859 case Intrinsic::arm_neon_vld4dup:
15860 NewOpc = ARMISD::VLD4DUP_UPD;
15861 NumVecs = 4;
15862 break;
15863 case Intrinsic::arm_neon_vld2lane:
15864 NewOpc = ARMISD::VLD2LN_UPD;
15865 NumVecs = 2;
15866 isLaneOp = true;
15867 break;
15868 case Intrinsic::arm_neon_vld3lane:
15869 NewOpc = ARMISD::VLD3LN_UPD;
15870 NumVecs = 3;
15871 isLaneOp = true;
15872 break;
15873 case Intrinsic::arm_neon_vld4lane:
15874 NewOpc = ARMISD::VLD4LN_UPD;
15875 NumVecs = 4;
15876 isLaneOp = true;
15877 break;
15878 case Intrinsic::arm_neon_vst1:
15879 NewOpc = ARMISD::VST1_UPD;
15880 NumVecs = 1;
15881 isLoadOp = false;
15882 break;
15883 case Intrinsic::arm_neon_vst2:
15884 NewOpc = ARMISD::VST2_UPD;
15885 NumVecs = 2;
15886 isLoadOp = false;
15887 break;
15888 case Intrinsic::arm_neon_vst3:
15889 NewOpc = ARMISD::VST3_UPD;
15890 NumVecs = 3;
15891 isLoadOp = false;
15892 break;
15893 case Intrinsic::arm_neon_vst4:
15894 NewOpc = ARMISD::VST4_UPD;
15895 NumVecs = 4;
15896 isLoadOp = false;
15897 break;
15898 case Intrinsic::arm_neon_vst2lane:
15899 NewOpc = ARMISD::VST2LN_UPD;
15900 NumVecs = 2;
15901 isLoadOp = false;
15902 isLaneOp = true;
15903 break;
15904 case Intrinsic::arm_neon_vst3lane:
15905 NewOpc = ARMISD::VST3LN_UPD;
15906 NumVecs = 3;
15907 isLoadOp = false;
15908 isLaneOp = true;
15909 break;
15910 case Intrinsic::arm_neon_vst4lane:
15911 NewOpc = ARMISD::VST4LN_UPD;
15912 NumVecs = 4;
15913 isLoadOp = false;
15914 isLaneOp = true;
15915 break;
15916 case Intrinsic::arm_neon_vst1x2:
15917 NewOpc = ARMISD::VST1x2_UPD;
15918 NumVecs = 2;
15919 isLoadOp = false;
15920 hasAlignment = false;
15921 break;
15922 case Intrinsic::arm_neon_vst1x3:
15923 NewOpc = ARMISD::VST1x3_UPD;
15924 NumVecs = 3;
15925 isLoadOp = false;
15926 hasAlignment = false;
15927 break;
15928 case Intrinsic::arm_neon_vst1x4:
15929 NewOpc = ARMISD::VST1x4_UPD;
15930 NumVecs = 4;
15931 isLoadOp = false;
15932 hasAlignment = false;
15933 break;
15934 }
15935 } else {
// Non-intrinsic nodes default to lane-op handling; the generic LOAD and STORE
// cases below clear the flag again.
15936 isLaneOp = true;
15937 switch (N->getOpcode()) {
15938 default:
15939 llvm_unreachable("unexpected opcode for Neon base update");
15940 case ARMISD::VLD1DUP:
15941 NewOpc = ARMISD::VLD1DUP_UPD;
15942 NumVecs = 1;
15943 break;
15944 case ARMISD::VLD2DUP:
15945 NewOpc = ARMISD::VLD2DUP_UPD;
15946 NumVecs = 2;
15947 break;
15948 case ARMISD::VLD3DUP:
15949 NewOpc = ARMISD::VLD3DUP_UPD;
15950 NumVecs = 3;
15951 break;
15952 case ARMISD::VLD4DUP:
15953 NewOpc = ARMISD::VLD4DUP_UPD;
15954 NumVecs = 4;
15955 break;
15956 case ISD::LOAD:
15957 NewOpc = ARMISD::VLD1_UPD;
15958 NumVecs = 1;
15959 isLaneOp = false;
15960 break;
15961 case ISD::STORE:
15962 NewOpc = ARMISD::VST1_UPD;
15963 NumVecs = 1;
15964 isLaneOp = false;
15965 isLoadOp = false;
15966 break;
15967 }
15968 }
15969
15970 // Find the size of memory referenced by the load/store.
15971 EVT VecTy;
15972 if (isLoadOp) {
15973 VecTy = N->getValueType(0);
15974 } else if (Target.isIntrinsic) {
15975 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
15976 } else {
15977 assert(Target.isStore &&
15978 "Node has to be a load, a store, or an intrinsic!");
15979 VecTy = N->getOperand(1).getValueType();
15980 }
15981
15982 bool isVLDDUPOp =
15983 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
15984 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
15985
// Lane and dup operations access one element per vector, so the byte count is
// divided by the element count.
15986 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15987 if (isLaneOp || isVLDDUPOp)
15988 NumBytes /= VecTy.getVectorNumElements();
15989
15990 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
15991 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
15992 // separate instructions that make it harder to use a non-constant update.
15993 return false;
15994 }
15995
15996 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
15997 return false;
15998
15999 // OK, we found an ADD we can fold into the base update.
16000 // Now, create a _UPD node, taking care of not breaking alignment.
16001
16002 EVT AlignedVecTy = VecTy;
16003 Align Alignment = MemN->getAlign();
16004
16005 // If this is a less-than-standard-aligned load/store, change the type to
16006 // match the standard alignment.
16007 // The alignment is overlooked when selecting _UPD variants; and it's
16008 // easier to introduce bitcasts here than fix that.
16009 // There are 3 ways to get to this base-update combine:
16010 // - intrinsics: they are assumed to be properly aligned (to the standard
16011 // alignment of the memory type), so we don't need to do anything.
16012 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
16013 // intrinsics, so, likewise, there's nothing to do.
16014 // - generic load/store instructions: the alignment is specified as an
16015 // explicit operand, rather than implicitly as the standard alignment
16016 // of the memory type (like the intrinsics). We need to change the
16017 // memory type to match the explicit alignment. That way, we don't
16018 // generate non-standard-aligned ARMISD::VLDx nodes.
16019 if (isa<LSBaseSDNode>(N)) {
16020 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
// Re-type the access as a vector of integers whose element size equals the
// actual alignment, keeping the total byte count the same.
16021 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
16022 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
16023 assert(!isLaneOp && "Unexpected generic load/store lane.");
16024 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
16025 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
16026 }
16027 // Don't set an explicit alignment on regular load/stores that we want
16028 // to transform to VLD/VST 1_UPD nodes.
16029 // This matches the behavior of regular load/stores, which only get an
16030 // explicit alignment if the MMO alignment is larger than the standard
16031 // alignment of the memory type.
16032 // Intrinsics, however, always get an explicit alignment, set to the
16033 // alignment of the MMO.
16034 Alignment = Align(1);
16035 }
16036
16037 // Create the new updating load/store node.
16038 // First, create an SDVTList for the new updating node's results.
// The result list is: NumResultVecs vectors (loads only), then an i32 that
// replaces the pointer-update node below, then the chain. NumVecs is at most 4
// in the switches above, so six entries are enough.
16039 EVT Tys[6];
16040 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16041 unsigned n;
16042 for (n = 0; n < NumResultVecs; ++n)
16043 Tys[n] = AlignedVecTy;
16044 Tys[n++] = MVT::i32;
16045 Tys[n] = MVT::Other;
16046 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16047
16048 // Then, gather the new node's operands.
16050 Ops.push_back(N->getOperand(0)); // incoming chain
16051 Ops.push_back(N->getOperand(Target.AddrOpIdx));
16052 Ops.push_back(User.Inc);
16053
16054 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
16055 // Try to match the intrinsic's signature
16056 Ops.push_back(StN->getValue());
16057 } else {
16058 // Loads (and of course intrinsics) match the intrinsics' signature,
16059 // so just add all but the alignment operand.
16060 unsigned LastOperand =
16061 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
16062 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
16063 Ops.push_back(N->getOperand(i));
16064 }
16065
16066 // For all node types, the alignment operand is always the last one.
16067 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
16068
16069 // If this is a non-standard-aligned STORE, the penultimate operand is the
16070 // stored value. Bitcast it to the aligned type.
16071 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
16072 SDValue &StVal = Ops[Ops.size() - 2];
16073 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
16074 }
16075
// Lane operations use the element type as the memory type of the new node;
// everything else uses the (possibly re-typed) vector type.
16076 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
16077 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
16078 MemN->getMemOperand());
16079
16080 // Update the uses.
16081 SmallVector<SDValue, 5> NewResults;
16082 for (unsigned i = 0; i < NumResultVecs; ++i)
16083 NewResults.push_back(SDValue(UpdN.getNode(), i));
16084
16085 // If this is a non-standard-aligned LOAD, the first result is the loaded
16086 // value. Bitcast it to the expected result type.
16087 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
16088 SDValue &LdVal = NewResults[0];
16089 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
16090 }
16091
16092 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16093 DCI.CombineTo(N, NewResults);
// The pointer-update node is replaced by the i32 result of the new node.
16094 DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
16095
16096 return true;
16097}
16098
16099// If (opcode ptr inc) is an ADD-like instruction, return the
16100// increment value. Otherwise return 0.
// ISD::ADD and ARMISD::VLD1_UPD always count as ADD-like. ISD::OR counts only
// when DAG.haveNoCommonBitsSet(Ptr, Inc) holds. A zero return therefore also
// covers a null `CInc`.
// NOTE(review): the declaration of `CInc` (listing line 16103) is missing from
// this listing; it is presumably `Inc` viewed as a constant node -- confirm
// against the full file.
16101static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
16102 SDValue Inc, const SelectionDAG &DAG) {
16104 if (!CInc)
16105 return 0;
16106
16107 switch (Opcode) {
16108 case ARMISD::VLD1_UPD:
16109 case ISD::ADD:
16110 return CInc->getZExtValue();
16111 case ISD::OR: {
16112 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
16113 // (OR ptr inc) is the same as (ADD ptr inc)
16114 return CInc->getZExtValue();
16115 }
16116 return 0;
16117 }
16118 default:
16119 return 0;
16120 }
16121}
16122
// Splits a pointer-producing node N into a base pointer and a constant
// increment. For ISD::ADD and ISD::OR the base is operand 0 and the increment
// operand 1; for ARMISD::VLD1_UPD the base is operand 1 and the increment
// operand 2. Writes both through the out-parameters and returns true only
// when the increment operand is a ConstantSDNode; otherwise returns false and
// leaves *Ptr and *CInc untouched.
// NOTE(review): the signature line (listing line 16123) is missing from this
// listing; the visible call site passes (node, &Base, &CInc). Unlike
// getPointerConstIncrement, the OR case here has no no-common-bits check.
16124 switch (N->getOpcode()) {
16125 case ISD::ADD:
16126 case ISD::OR: {
16127 if (isa<ConstantSDNode>(N->getOperand(1))) {
16128 *Ptr = N->getOperand(0);
16129 *CInc = N->getOperand(1);
16130 return true;
16131 }
16132 return false;
16133 }
16134 case ARMISD::VLD1_UPD: {
16135 if (isa<ConstantSDNode>(N->getOperand(2))) {
16136 *Ptr = N->getOperand(1);
16137 *CInc = N->getOperand(2);
16138 return true;
16139 }
16140 return false;
16141 }
16142 default:
16143 return false;
16144 }
16145}
16146
// Returns true when neither N nor User is found to be a predecessor of the
// other by SDNode::hasPredecessorHelper, i.e. when folding User into N would
// not create a cycle; returns false otherwise.
// NOTE(review): the signature line (listing line 16147) and the declarations
// of `Visited` and `Worklist` (listing lines 16152-16153) are missing from
// this listing; the visible call site passes (N, User.N).
16148 // Check that the add is independent of the load/store.
16149 // Otherwise, folding it would create a cycle. Search through Addr
16150 // as well, since the User may not be a direct user of Addr and
16151 // only share a base pointer.
16154 Worklist.push_back(N);
16155 Worklist.push_back(User);
// Both queries share the same Visited set and Worklist.
16156 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
16157 SDNode::hasPredecessorHelper(User, Visited, Worklist))
16158 return false;
16159 return true;
16160}
16161
16162 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
16163 /// NEON load/store intrinsics, and generic vector load/stores, to merge
16164 /// base address updates.
16165 /// For generic load/stores, the memory type is assumed to be a vector.
16166 /// The caller is assumed to have checked legality.
///
/// Collects candidate address-increment nodes for the memory node N, drops
/// the ones rejected by isValidBaseUpdate, and hands the rest to
/// TryCombineBaseUpdate. Every path visible here returns an empty SDValue.
// NOTE(review): the declaration line of this function and the declaration of
// BaseUpdates are absent from this listing.
// The address is operand 2 for intrinsics and stores, operand 1 otherwise.
16169 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
16170 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
16171 const bool isStore = N->getOpcode() == ISD::STORE;
16172 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
16173 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
16174
16175 SDValue Addr = N->getOperand(AddrOpIdx);
16176
16178
16179 // Search for a use of the address operand that is an increment.
16180 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
16181 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
16182 SDNode *User = *UI;
// Only two-operand users of the same result number of Addr are considered.
16183 if (UI.getUse().getResNo() != Addr.getResNo() ||
16184 User->getNumOperands() != 2)
16185 continue;
16186
// Inc is the operand of User that is not the use of Addr.
16187 SDValue Inc = User->getOperand(UI.getOperandNo() == 1 ? 0 : 1);
16188 unsigned ConstInc =
16189 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
16190
// Keep users with a non-zero ConstInc, plus any ISD::ADD user (which is
// recorded with ConstInc == 0 when no constant increment was found).
16191 if (ConstInc || User->getOpcode() == ISD::ADD)
16192 BaseUpdates.push_back({User, Inc, ConstInc});
16193 }
16194
16195 // If the address is a constant pointer increment itself, find
16196 // another constant increment that has the same base operand
16197 SDValue Base;
16198 SDValue CInc;
16199 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
16200 unsigned Offset =
16201 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
16202 for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end();
16203 UI != UE; ++UI) {
16204
16205 SDNode *User = *UI;
// Skip Addr itself, uses of a different result of Base, and users that do
// not have exactly two operands.
16206 if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() ||
16207 User->getNumOperands() != 2)
16208 continue;
16209
16210 SDValue UserInc = User->getOperand(UI.getOperandNo() == 0 ? 1 : 0);
16211 unsigned UserOffset =
16212 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16213
// Only users whose offset from Base is strictly greater than Addr's offset
// are kept, so the subtraction below cannot wrap.
16214 if (!UserOffset || UserOffset <= Offset)
16215 continue;
16216
// Express the user as an increment relative to Addr rather than to Base.
16217 unsigned NewConstInc = UserOffset - Offset;
16218 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16219 BaseUpdates.push_back({User, NewInc, NewConstInc});
16220 }
16221 }
16222
16223 // Try to fold the load/store with an update that matches memory
16224 // access size. This should work well for sequential loads.
16225 //
16226 // Filter out invalid updates as well.
// Invalid entries are swapped to the tail and cut off by the resize below;
// I is not advanced after a swap so the swapped-in entry is examined too.
16227 unsigned NumValidUpd = BaseUpdates.size();
16228 for (unsigned I = 0; I < NumValidUpd;) {
16229 BaseUpdateUser &User = BaseUpdates[I];
16230 if (!isValidBaseUpdate(N, User.N)) {
16231 --NumValidUpd;
16232 std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]);
16233 continue;
16234 }
16235
// NOTE(review): an empty SDValue is returned even when the fold succeeds;
// presumably TryCombineBaseUpdate performs the replacement through DCI.
16236 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16237 return SDValue();
16238 ++I;
16239 }
16240 BaseUpdates.resize(NumValidUpd);
16241
16242 // Try to fold with other users. Non-constant updates are considered
16243 // first, and constant updates are sorted to not break a sequence of
16244 // strided accesses (if there is any).
// Ascending order of ConstInc puts the ConstInc == 0 entries first.
16245 std::stable_sort(BaseUpdates.begin(), BaseUpdates.end(),
16246 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16247 return LHS.ConstInc < RHS.ConstInc;
16248 });
16249 for (BaseUpdateUser &User : BaseUpdates) {
16250 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16251 return SDValue();
16252 }
16253 return SDValue();
16254}
16255
// NOTE(review): the declaration line of this function is absent from this
// listing; presumably this is the NEON vld/vst intrinsic combine entry point
// (PerformVLDCombine upstream) -- confirm against the full file.
//
// Does nothing before legalization or when invoked by the legalizer;
// otherwise forwards N to CombineBaseUpdate and returns its result.
16258 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16259 return SDValue();
16260
16261 return CombineBaseUpdate(N, DCI);
16262}
16263
// NOTE(review): the declaration line of this function and the declarations of
// Visited, Worklist, CInc and Ops are absent from this listing. The
// llvm_unreachable text below suggests this is the MVE VLDn/VSTn base-update
// combine (PerformMVEVLDCombine upstream) -- confirm against the full file.
//
// Looks for an ISD::ADD user of the intrinsic's address operand whose constant
// increment equals the number of bytes accessed, and replaces the pair with
// the matching ARMISD::VLDn_UPD / VSTn_UPD node. Always returns an empty
// SDValue; replacements are made through DCI.CombineTo.
16266 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16267 return SDValue();
16268
16269 SelectionDAG &DAG = DCI.DAG;
16270 SDValue Addr = N->getOperand(2);
16271 MemSDNode *MemN = cast<MemSDNode>(N);
16272 SDLoc dl(N);
16273
16274 // For the stores, where there are multiple intrinsics we only actually want
16275 // to post-inc the last of them.
// For vst2q the constant in operand 5 must be 1, and for vst4q the constant
// in operand 7 must be 3; any other value bails out.
16276 unsigned IntNo = N->getConstantOperandVal(1);
16277 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16278 return SDValue();
16279 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16280 return SDValue();
16281
16282 // Search for a use of the address operand that is an increment.
16283 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
16284 UE = Addr.getNode()->use_end();
16285 UI != UE; ++UI) {
16286 SDNode *User = *UI;
16287 if (User->getOpcode() != ISD::ADD ||
16288 UI.getUse().getResNo() != Addr.getResNo())
16289 continue;
16290
16291 // Check that the add is independent of the load/store. Otherwise, folding
16292 // it would create a cycle. We can avoid searching through Addr as it's a
16293 // predecessor to both.
16296 Visited.insert(Addr.getNode());
16297 Worklist.push_back(N);
16298 Worklist.push_back(User);
16299 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
16300 SDNode::hasPredecessorHelper(User, Visited, Worklist))
16301 continue;
16302
16303 // Find the new opcode for the updating load/store.
16304 bool isLoadOp = true;
16305 unsigned NewOpc = 0;
16306 unsigned NumVecs = 0;
16307 switch (IntNo) {
16308 default:
16309 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16310 case Intrinsic::arm_mve_vld2q:
16311 NewOpc = ARMISD::VLD2_UPD;
16312 NumVecs = 2;
16313 break;
16314 case Intrinsic::arm_mve_vld4q:
16315 NewOpc = ARMISD::VLD4_UPD;
16316 NumVecs = 4;
16317 break;
16318 case Intrinsic::arm_mve_vst2q:
16319 NewOpc = ARMISD::VST2_UPD;
16320 NumVecs = 2;
16321 isLoadOp = false;
16322 break;
16323 case Intrinsic::arm_mve_vst4q:
16324 NewOpc = ARMISD::VST4_UPD;
16325 NumVecs = 4;
16326 isLoadOp = false;
16327 break;
16328 }
16329
16330 // Find the size of memory referenced by the load/store.
// Loads take the vector type from result 0, stores from operand 3.
16331 EVT VecTy;
16332 if (isLoadOp) {
16333 VecTy = N->getValueType(0);
16334 } else {
16335 VecTy = N->getOperand(3).getValueType();
16336 }
16337
16338 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16339
16340 // If the increment is a constant, it must match the memory ref size.
// A null CInc is rejected here too, so only matching constant increments
// get past this check.
16341 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
16343 if (!CInc || CInc->getZExtValue() != NumBytes)
16344 continue;
16345
16346 // Create the new updating load/store node.
16347 // First, create an SDVTList for the new updating node's results.
// Result layout: NumResultVecs vectors, then the i32 written-back pointer,
// then the chain. Six slots cover the largest case (4 + 2).
16348 EVT Tys[6];
16349 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16350 unsigned n;
16351 for (n = 0; n < NumResultVecs; ++n)
16352 Tys[n] = VecTy;
16353 Tys[n++] = MVT::i32;
16354 Tys[n] = MVT::Other;
16355 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16356
16357 // Then, gather the new node's operands.
16359 Ops.push_back(N->getOperand(0)); // incoming chain
16360 Ops.push_back(N->getOperand(2)); // ptr
16361 Ops.push_back(Inc);
16362
// Remaining operands of the intrinsic (index 3 onwards) are copied as-is.
16363 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16364 Ops.push_back(N->getOperand(i));
16365
16366 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16367 MemN->getMemOperand());
16368
16369 // Update the uses.
16370 SmallVector<SDValue, 5> NewResults;
16371 for (unsigned i = 0; i < NumResultVecs; ++i)
16372 NewResults.push_back(SDValue(UpdN.getNode(), i));
16373
// N is replaced by the vector results plus the chain; the ADD user is
// replaced by the written-back pointer (result index NumResultVecs).
16374 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16375 DCI.CombineTo(N, NewResults);
16376 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16377
16378 break;
16379 }
16380
16381 return SDValue();
16382}
16383
16384 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16385 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16386 /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16387 /// return true.
// NOTE(review): the declaration line of this function and the declaration of
// VLDMemInt are absent from this listing.
16389 SelectionDAG &DAG = DCI.DAG;
16390 EVT VT = N->getValueType(0);
16391 // vldN-dup instructions only support 64-bit vectors for N > 1.
16392 if (!VT.is64BitVector())
16393 return false;
16394
16395 // Check if the VDUPLANE operand is a vldN-lane intrinsic.
16396 SDNode *VLD = N->getOperand(0).getNode();
16397 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16398 return false;
16399 unsigned NumVecs = 0;
16400 unsigned NewOpc = 0;
16401 unsigned IntNo = VLD->getConstantOperandVal(1);
16402 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16403 NumVecs = 2;
16404 NewOpc = ARMISD::VLD2DUP;
16405 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16406 NumVecs = 3;
16407 NewOpc = ARMISD::VLD3DUP;
16408 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16409 NumVecs = 4;
16410 NewOpc = ARMISD::VLD4DUP;
16411 } else {
16412 return false;
16413 }
16414
16415 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16416 // numbers match the load.
// The load's lane number is read from operand NumVecs + 3.
16417 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16418 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
16419 UI != UE; ++UI) {
16420 // Ignore uses of the chain result.
16421 if (UI.getUse().getResNo() == NumVecs)
16422 continue;
16423 SDNode *User = *UI;
16424 if (User->getOpcode() != ARMISD::VDUPLANE ||
16425 VLDLaneNo != User->getConstantOperandVal(1))
16426 return false;
16427 }
16428
16429 // Create the vldN-dup node.
// Result types: NumVecs vectors of type VT followed by the chain.
16430 EVT Tys[5];
16431 unsigned n;
16432 for (n = 0; n < NumVecs; ++n)
16433 Tys[n] = VT;
16434 Tys[n] = MVT::Other;
16435 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
// Only the chain (operand 0) and operand 2 of the original intrinsic are
// passed on to the new node.
16436 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16438 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16439 Ops, VLDMemInt->getMemoryVT(),
16440 VLDMemInt->getMemOperand());
16441
16442 // Update the uses.
// Each VDUPLANE user is replaced by the same-numbered result of the new node.
16443 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
16444 UI != UE; ++UI) {
16445 unsigned ResNo = UI.getUse().getResNo();
16446 // Ignore uses of the chain result.
16447 if (ResNo == NumVecs)
16448 continue;
16449 SDNode *User = *UI;
16450 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
16451 }
16452
16453 // Now the vldN-lane intrinsic is dead except for its chain result.
16454 // Update uses of the chain.
16455 std::vector<SDValue> VLDDupResults;
16456 for (unsigned n = 0; n < NumVecs; ++n)
16457 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16458 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16459 DCI.CombineTo(VLD, VLDDupResults);
16460
16461 return true;
16462}
16463
16464 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
16465 /// ARMISD::VDUPLANE.
16468 const ARMSubtarget *Subtarget) {
// NOTE(review): the first lines of this function's declaration are absent
// from this listing.
//
// Three rewrites are tried in order: the MVE VDUP-of-extract form, the
// vldN-dup fold via CombineVLDDUP, and dropping a VDUPLANE whose source is
// already a VMOVIMM/VMVNIMM splat. An empty SDValue means no change.
16469 SDValue Op = N->getOperand(0);
16470 EVT VT = N->getValueType(0);
16471
16472 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16473 if (Subtarget->hasMVEIntegerOps()) {
16474 EVT ExtractVT = VT.getVectorElementType();
16475 // We need to ensure we are creating a legal type.
16476 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16477 ExtractVT = MVT::i32;
16478 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16479 N->getOperand(0), N->getOperand(1));
16480 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16481 }
16482
16483 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16484 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16485 if (CombineVLDDUP(N, DCI))
16486 return SDValue(N, 0);
16487
16488 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16489 // redundant. Ignore bit_converts for now; element sizes are checked below.
16490 while (Op.getOpcode() == ISD::BITCAST)
16491 Op = Op.getOperand(0);
16492 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16493 return SDValue();
16494
16495 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16496 unsigned EltSize = Op.getScalarValueSizeInBits();
16497 // The canonical VMOV for a zero vector uses a 32-bit element size.
// When the immediate decodes to zero, EltSize is forced to 8 so the size
// check below cannot reject it on element width.
16498 unsigned Imm = Op.getConstantOperandVal(0);
16499 unsigned EltBits;
16500 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16501 EltSize = 8;
16502 if (EltSize > VT.getScalarSizeInBits())
16503 return SDValue();
16504
16505 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16506}
16507
16508 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16510 const ARMSubtarget *Subtarget) {
// NOTE(review): the first line of this function's declaration is absent from
// this listing.
//
// With MVE integer ops, an f32 or f16 VDUP operand is first converted to i32.
// With NEON, a VDUP of a suitable scalar load becomes ARMISD::VLD1DUP.
// Returns an empty SDValue when neither applies.
16511 SDValue Op = N->getOperand(0);
16512 SDLoc dl(N);
16513
16514 if (Subtarget->hasMVEIntegerOps()) {
16515 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16516 // need to come from a GPR.
16517 if (Op.getValueType() == MVT::f32)
16518 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16519 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
// f16 operands go through ARMISD::VMOVrh to produce the i32 value instead.
16520 else if (Op.getValueType() == MVT::f16)
16521 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16522 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16523 }
16524
16525 if (!Subtarget->hasNEON())
16526 return SDValue();
16527
16528 // Match VDUP(LOAD) -> VLD1DUP.
16529 // We match this pattern here rather than waiting for isel because the
16530 // transform is only legal for unindexed loads.
// The load must have a single use, be unindexed, and its memory type must
// equal the VDUP result's element type.
16531 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16532 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16533 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
// Operands: the load's chain, its address, and its alignment as an i32
// constant.
16534 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16535 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16536 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16537 SDValue VLDDup =
16538 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
16539 LD->getMemoryVT(), LD->getMemOperand());
// Redirect users of the old load's chain result to the new node's chain.
16540 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16541 return VLDDup;
16542 }
16543
16544 return SDValue();
16545}
16546
16549 const ARMSubtarget *Subtarget) {
// NOTE(review): the first lines of this function's declaration and the final
// line of the `if` condition below are absent from this listing; presumably
// this is the ISD::LOAD combine (PerformLOADCombine upstream) -- confirm.
//
// Forwards qualifying NEON vector loads to CombineBaseUpdate; returns an
// empty SDValue otherwise.
16550 EVT VT = N->getValueType(0);
16551
16552 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16553 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16555 return CombineBaseUpdate(N, DCI);
16556
16557 return SDValue();
16558}
16559
16560 // Optimize trunc store (of multiple scalars) to shuffle and store. First,
16561 // pack all of the elements in one place. Next, store to memory in fewer
16562 // chunks.
16564 SelectionDAG &DAG) {
// NOTE(review): the first line of this function's declaration and the
// declaration of Chains are absent from this listing.
//
// Returns an empty SDValue when the store is not a truncating vector store or
// a required type is illegal; otherwise returns a TokenFactor over the
// replacement stores.
16565 SDValue StVal = St->getValue();
16566 EVT VT = StVal.getValueType();
16567 if (!St->isTruncatingStore() || !VT.isVector())
16568 return SDValue();
16569 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16570 EVT StVT = St->getMemoryVT();
16571 unsigned NumElems = VT.getVectorNumElements();
16572 assert(StVT != VT && "Cannot truncate to the same type");
16573 unsigned FromEltSz = VT.getScalarSizeInBits();
16574 unsigned ToEltSz = StVT.getScalarSizeInBits();
16575
16576 // From, To sizes and ElemCount must be pow of two
// A product is a power of two only if every factor is, so one test covers
// all three values.
16577 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16578 return SDValue();
16579
16580 // We are going to use the original vector elt for storing.
16581 // Accumulated smaller vector elements must be a multiple of the store size.
16582 if (0 != (NumElems * FromEltSz) % ToEltSz)
16583 return SDValue();
16584
16585 unsigned SizeRatio = FromEltSz / ToEltSz;
16586 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16587
16588 // Create a type on which we perform the shuffle.
16589 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16590 NumElems * SizeRatio);
16591 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16592
16593 SDLoc DL(St);
16594 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
// Mask entry i selects one narrow sub-element of wide element i: the last one
// on big-endian, the first one on little-endian. The other entries stay -1.
16595 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16596 for (unsigned i = 0; i < NumElems; ++i)
16597 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16598 : i * SizeRatio;
16599
16600 // Can't shuffle using an illegal type.
16601 if (!TLI.isTypeLegal(WideVecVT))
16602 return SDValue();
16603
16604 SDValue Shuff = DAG.getVectorShuffle(
16605 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16606 // At this point all of the data is stored at the bottom of the
16607 // register. We now need to save it to mem.
16608
16609 // Find the largest store unit
// The last legal integer type no wider than the packed data wins; this picks
// the largest one provided integer_valuetypes() iterates in increasing size
// -- TODO confirm.
16610 MVT StoreType = MVT::i8;
16611 for (MVT Tp : MVT::integer_valuetypes()) {
16612 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16613 StoreType = Tp;
16614 }
16615 // Didn't find a legal store type.
16616 if (!TLI.isTypeLegal(StoreType))
16617 return SDValue();
16618
16619 // Bitcast the original vector into a vector of store-size units
16620 EVT StoreVecVT =
16621 EVT::getVectorVT(*DAG.getContext(), StoreType,
16622 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16623 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16624 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
// Byte distance between consecutive stores.
16626 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16627 TLI.getPointerTy(DAG.getDataLayout()));
16628 SDValue BasePtr = St->getBasePtr();
16629
16630 // Perform one or more big stores into memory.
// NOTE(review): every store in this loop reuses St->getPointerInfo() and
// St->getAlign() unchanged even though BasePtr advances by Increment each
// iteration -- confirm that this is intended.
16631 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16632 for (unsigned I = 0; I < E; I++) {
16633 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16634 ShuffWide, DAG.getIntPtrConstant(I, DL));
16635 SDValue Ch =
16636 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16637 St->getAlign(), St->getMemOperand()->getFlags());
16638 BasePtr =
16639 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16640 Chains.push_back(Ch);
16641 }
16642 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16643}
16644
16645 // Try taking a single vector store from an fpround (which would otherwise turn
16646 // into an expensive buildvector) and splitting it into a series of narrowing
16647 // stores.
16649 SelectionDAG &DAG) {
// NOTE(review): the first line of this function's declaration, one line after
// the ToVT.isVector() check, and the declarations of MMOFlags and Stores are
// absent from this listing.
//
// Handles only simple, non-truncating, unindexed stores of an FP_ROUND from
// f32 elements to f16 elements whose source element count is a multiple of 4.
// Returns an empty SDValue otherwise.
16650 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16651 return SDValue();
16652 SDValue Trunc = St->getValue();
16653 if (Trunc->getOpcode() != ISD::FP_ROUND)
16654 return SDValue();
16655 EVT FromVT = Trunc->getOperand(0).getValueType();
16656 EVT ToVT = Trunc.getValueType();
16657 if (!ToVT.isVector())
16658 return SDValue();
16660 EVT ToEltVT = ToVT.getVectorElementType();
16661 EVT FromEltVT = FromVT.getVectorElementType();
16662
16663 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16664 return SDValue();
16665
// Each replacement store covers a slice of four source elements.
16666 unsigned NumElements = 4;
16667 if (FromVT.getVectorNumElements() % NumElements != 0)
16668 return SDValue();
16669
16670 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16671 // use the VMOVN over splitting the store. We are looking for patterns of:
16672 // !rev: 0 N 1 N+1 2 N+2 ...
16673 // rev: N 0 N+1 1 N+2 2 ...
16674 // The shuffle may either be a single source (in which case N = NumElts/2) or
16675 // two inputs extended with concat to the same size (in which case N =
16676 // NumElts).
16677 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16678 ArrayRef<int> M = SVN->getMask();
16679 unsigned NumElts = ToVT.getVectorNumElements();
16680 if (SVN->getOperand(1).isUndef())
16681 NumElts /= 2;
16682
16683 unsigned Off0 = Rev ? NumElts : 0;
16684 unsigned Off1 = Rev ? 0 : NumElts;
16685
// Negative mask entries are accepted in any position.
16686 for (unsigned I = 0; I < NumElts; I += 2) {
16687 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16688 return false;
16689 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16690 return false;
16691 }
16692
16693 return true;
16694 };
16695
// Leave the store untouched when the FP_ROUND input is such a shuffle.
16696 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16697 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16698 return SDValue();
16699
16700 LLVMContext &C = *DAG.getContext();
16701 SDLoc DL(St);
16702 // Details about the old store
16703 SDValue Ch = St->getChain();
16704 SDValue BasePtr = St->getBasePtr();
16705 Align Alignment = St->getOriginalAlign();
16707 AAMDNodes AAInfo = St->getAAInfo();
16708
16709 // We split the store into slices of NumElements. fp16 trunc stores are vcvt
16710 // and then stored as truncating integer stores.
16711 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16712 EVT NewToVT = EVT::getVectorVT(
16713 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
16714
16716 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
// Byte offset of slice i in the destination.
16717 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16718 SDValue NewPtr =
16719 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16720
16721 SDValue Extract =
16722 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16723 DAG.getConstant(i * NumElements, DL, MVT::i32));
16724
// Convert the slice with ARMISD::VCVTN into an undef v8f16 (constant operand
// 0), then view the result as v4i32 so that it can be stored with a
// truncating integer store of type NewToVT.
16725 SDValue FPTrunc =
16726 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16727 Extract, DAG.getConstant(0, DL, MVT::i32));
16728 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16729
// All stores hang off the original chain and are joined by the TokenFactor
// returned below.
16730 SDValue Store = DAG.getTruncStore(
16731 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16732 NewToVT, Alignment, MMOFlags, AAInfo);
16733 Stores.push_back(Store);
16734 }
16735 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16736}
16737
16738 // Try taking a single vector store from an MVETRUNC (which would otherwise turn
16739 // into an expensive buildvector) and splitting it into a series of narrowing
16740 // stores.
16742 SelectionDAG &DAG) {
// NOTE(review): the first line of this function's declaration and the
// declarations of MMOFlags and Stores are absent from this listing, so the
// function's name cannot be read here (presumably
// PerformSplittingMVETruncToNarrowingStores upstream -- confirm).
//
// Handles only simple, non-truncating, unindexed stores whose value is an
// ARMISD::MVETRUNC; returns an empty SDValue otherwise.
16743 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16744 return SDValue();
16745 SDValue Trunc = St->getValue();
16746 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16747 return SDValue();
16748 EVT FromVT = Trunc->getOperand(0).getValueType();
16749 EVT ToVT = Trunc.getValueType();
16750
16751 LLVMContext &C = *DAG.getContext();
16752 SDLoc DL(St);
16753 // Details about the old store
16754 SDValue Ch = St->getChain();
16755 SDValue BasePtr = St->getBasePtr();
16756 Align Alignment = St->getOriginalAlign();
16758 AAMDNodes AAInfo = St->getAAInfo();
16759
// Memory type of each narrowing store: the destination element type with the
// element count of one MVETRUNC operand.
16760 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16761 FromVT.getVectorNumElements());
16762
// One truncating store per MVETRUNC operand, placed at consecutive byte
// offsets from the original base pointer.
16764 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16765 unsigned NewOffset =
16766 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16767 SDValue NewPtr =
16768 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16769
16770 SDValue Extract = Trunc.getOperand(i);
16771 SDValue Store = DAG.getTruncStore(
16772 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16773 NewToVT, Alignment, MMOFlags, AAInfo);
16774 Stores.push_back(Store);
16775 }
16776 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16777}
16778
16779 // Given a floating point store from an extracted vector, with an integer
16780 // VGETLANE that already exists, store the existing VGETLANEu directly. This can
16781 // help reduce fp register pressure, doesn't require the fp extract and allows
16782 // use of more integer post-inc stores not available with vstr.
// NOTE(review): the declaration line of this function and the declaration of
// MMOFlags are absent from this listing.
//
// Returns the replacement truncating store, or an empty SDValue when the
// pattern does not match or no matching VGETLANEu node already exists.
16784 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16785 return SDValue();
16786 SDValue Extract = St->getValue();
16787 EVT VT = Extract.getValueType();
16788 // For now only uses f16. This may be useful for f32 too, but that will
16789 // be bitcast(extract), not the VGETLANEu we currently check here.
16790 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16791 return SDValue();
16792
// Only reuse an existing i32 VGETLANEu with the same vector and lane operands;
// getNodeIfExists does not create one.
16793 SDNode *GetLane =
16794 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16795 {Extract.getOperand(0), Extract.getOperand(1)});
16796 if (!GetLane)
16797 return SDValue();
16798
16799 LLVMContext &C = *DAG.getContext();
16800 SDLoc DL(St);
16801 // Create a new integer store to replace the existing floating point version.
16802 SDValue Ch = St->getChain();
16803 SDValue BasePtr = St->getBasePtr();
16804 Align Alignment = St->getOriginalAlign();
16806 AAMDNodes AAInfo = St->getAAInfo();
// The i32 lane value is stored truncated to an integer as wide as VT.
16807 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16808 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16809 St->getPointerInfo(), NewToVT, Alignment,
16810 MMOFlags, AAInfo);
16811
16812 return Store;
16813}
16814
16815 /// PerformSTORECombine - Target-specific dag combine xforms for
16816 /// ISD::STORE.
16819 const ARMSubtarget *Subtarget) {
// NOTE(review): several lines of this function are absent from this listing:
// the start of the declaration, the declaration of St, the call inside the
// second `if (SDValue NewToken =` below, the rest of the i64 condition, the
// second argument line of the FloatVT construction, and the end of the final
// `if` condition. Comments here describe only what is visible.
//
// Volatile stores are never touched. The store-splitting helpers are tried
// first, gated on subtarget features, followed by the VMOVDRR split, the
// i64-to-f64 rewrite and finally CombineBaseUpdate.
16821 if (St->isVolatile())
16822 return SDValue();
16823 SDValue StVal = St->getValue();
16824 EVT VT = StVal.getValueType();
16825
16826 if (Subtarget->hasNEON())
16827 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16828 return Store;
16829
16830 if (Subtarget->hasMVEFloatOps())
16831 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16832 return NewToken;
16833
16834 if (Subtarget->hasMVEIntegerOps()) {
16835 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16836 return NewChain;
16837 if (SDValue NewToken =
16839 return NewToken;
16840 }
16841
16842 if (!ISD::isNormalStore(St))
16843 return SDValue();
16844
16845 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16846 // ARM stores of arguments in the same cache line.
16847 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16848 StVal.getNode()->hasOneUse()) {
16849 SelectionDAG &DAG = DCI.DAG;
16850 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16851 SDLoc DL(St);
16852 SDValue BasePtr = St->getBasePtr();
// The operand stored at the base address depends on endianness: operand 1 on
// big-endian, operand 0 on little-endian.
16853 SDValue NewST1 = DAG.getStore(
16854 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16855 BasePtr, St->getPointerInfo(), St->getOriginalAlign(),
16856 St->getMemOperand()->getFlags());
16857
// The other operand goes 4 bytes further on, chained after the first store.
16858 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16859 DAG.getConstant(4, DL, MVT::i32));
16860 return DAG.getStore(NewST1.getValue(0), DL,
16861 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16862 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16863 St->getOriginalAlign(),
16864 St->getMemOperand()->getFlags());
16865 }
16866
16867 if (StVal.getValueType() == MVT::i64 &&
16869
16870 // Bitcast an i64 store extracted from a vector to f64.
16871 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16872 SelectionDAG &DAG = DCI.DAG;
16873 SDLoc dl(StVal);
16874 SDValue IntVec = StVal.getOperand(0);
16875 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16877 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16878 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16879 Vec, StVal.getOperand(1));
// From here on, new nodes take the store's debug location rather than the
// stored value's.
16880 dl = SDLoc(N);
16881 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16882 // Make the DAGCombiner fold the bitcasts.
16883 DCI.AddToWorklist(Vec.getNode());
16884 DCI.AddToWorklist(ExtElt.getNode());
16885 DCI.AddToWorklist(V.getNode());
16886 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16887 St->getPointerInfo(), St->getAlign(),
16888 St->getMemOperand()->getFlags(), St->getAAInfo());
16889 }
16890
16891 // If this is a legal vector store, try to combine it into a VST1_UPD.
16892 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16894 return CombineBaseUpdate(N, DCI);
16895
16896 return SDValue();
16897}
16898
16899 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16900 /// can replace combinations of VMUL and VCVT (floating-point to integer)
16901 /// when the VMUL has a constant operand that is a power of 2.
16902 ///
16903 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16904 /// vmul.f32 d16, d17, d16
16905 /// vcvt.s32.f32 d16, d16
16906 /// becomes:
16907 /// vcvt.s32.f32 d16, d16, #3
16909 const ARMSubtarget *Subtarget) {
// NOTE(review): the first line of this function's declaration and the
// declaration of BV are absent from this listing.
16910 if (!Subtarget->hasNEON())
16911 return SDValue();
16912
// The conversion's input must be a simple-typed vector FMUL.
16913 SDValue Op = N->getOperand(0);
16914 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16915 Op.getOpcode() != ISD::FMUL)
16916 return SDValue();
16917
// Only operand 1 of the FMUL is inspected for the constant.
16918 SDValue ConstVec = Op->getOperand(1);
16919 if (!isa<BuildVectorSDNode>(ConstVec))
16920 return SDValue();
16921
16922 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16923 uint32_t FloatBits = FloatTy.getSizeInBits();
16924 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16925 uint32_t IntBits = IntTy.getSizeInBits();
16926 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16927 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16928 // These instructions only exist converting from f32 to i32. We can handle
16929 // smaller integers by generating an extra truncate, but larger ones would
16930 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16931 // these instructions only support v2i32/v4i32 types.
16932 return SDValue();
16933 }
16934
// C is the result of getConstantFPSplatPow2ToLog2Int; values of -1, 0 and
// anything above 32 are rejected.
16935 BitVector UndefElements;
16937 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16938 if (C == -1 || C == 0 || C > 32)
16939 return SDValue();
16940
16941 SDLoc dl(N);
16942 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16943 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16944 Intrinsic::arm_neon_vcvtfp2fxu;
// The fixed-point conversion takes the FMUL's operand 0 directly and passes
// C as the intrinsic's last operand.
16945 SDValue FixConv = DAG.getNode(
16946 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16947 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16948 DAG.getConstant(C, dl, MVT::i32));
16949
// Narrower integer results are produced by truncating the 32-bit conversion.
16950 if (IntBits < FloatBits)
16951 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16952
16953 return FixConv;
16954}
16955
16957 const ARMSubtarget *Subtarget) {
// NOTE(review): the first line of this function's declaration is absent from
// this listing; the name PerformFAddVSelectCombine is taken from a call that
// appears later in this file.
//
// Returns the rewritten VSELECT, or an empty SDValue when the subtarget lacks
// MVE float ops or the pattern does not match.
16958 if (!Subtarget->hasMVEFloatOps())
16959 return SDValue();
16960
16961 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16962 // The second form can be more easily turned into a predicated vadd, and
16963 // possibly combined into a fma to become a predicated vfma.
16964 SDValue Op0 = N->getOperand(0);
16965 SDValue Op1 = N->getOperand(1);
16966 EVT VT = N->getValueType(0);
16967 SDLoc DL(N);
16968
16969 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
16970 // which these VMOV's represent.
// NOTE(review): 1664 and 2688 are presumably the VMOVIMM encodings of a -0.0
// splat for v4f32 and v8f16 respectively -- confirm. An immediate of 0 is
// accepted only when NSZ is set.
16971 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
16972 if (Op.getOpcode() != ISD::BITCAST ||
16973 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
16974 return false;
16975 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
16976 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
16977 return true;
16978 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
16979 return true;
16980 return false;
16981 };
16982
// Put the VSELECT in Op1 when exactly one operand is a VSELECT.
16983 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
16984 std::swap(Op0, Op1);
16985
16986 if (Op1.getOpcode() != ISD::VSELECT)
16987 return SDValue();
16988
// Only the VSELECT's operand 2 is checked for the identity splat.
16989 SDNodeFlags FaddFlags = N->getFlags();
16990 bool NSZ = FaddFlags.hasNoSignedZeros();
16991 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
16992 return SDValue();
16993
// The original FADD's flags are applied to both the new FADD and the new
// VSELECT.
16994 SDValue FAdd =
16995 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
16996 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
16997}
16998
// NOTE(review): the declaration line of this function is absent from this
// listing; the name PerformFADDVCMLACombine is taken from a call that appears
// later in this file.
//
// Folds an FADD into the operand at index 2 of an arm_mve_vcmlaq intrinsic
// feeding it. Only applies when the FADD allows reassociation; returns an
// empty SDValue otherwise.
17000 SDValue LHS = N->getOperand(0);
17001 SDValue RHS = N->getOperand(1);
17002 EVT VT = N->getValueType(0);
17003 SDLoc DL(N);
17004
17005 if (!N->getFlags().hasAllowReassociation())
17006 return SDValue();
17007
17008 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
// A is the candidate vcmla node and B the other FADD operand. Operands 0, 1,
// 3 and 4 of the intrinsic node are kept; operand 2 becomes fadd(operand 2, B)
// carrying the original FADD's flags.
17009 auto ReassocComplex = [&](SDValue A, SDValue B) {
17010 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
17011 return SDValue();
17012 unsigned Opc = A.getConstantOperandVal(0);
17013 if (Opc != Intrinsic::arm_mve_vcmlaq)
17014 return SDValue();
17015 SDValue VCMLA = DAG.getNode(
17016 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
17017 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
17018 A.getOperand(3), A.getOperand(4));
// The new intrinsic node keeps the flags of the node it replaces.
17019 VCMLA->setFlags(A->getFlags());
17020 return VCMLA;
17021 };
// Try both operand orders, since the vcmla may be on either side.
17022 if (SDValue R = ReassocComplex(LHS, RHS))
17023 return R;
17024 if (SDValue R = ReassocComplex(RHS, LHS))
17025 return R;
17026
17027 return SDValue();
17028}
17029
17031 const ARMSubtarget *Subtarget) {
// NOTE(review): the first line of this function's declaration is absent from
// this listing, so its name cannot be read here (presumably
// PerformFADDCombine upstream -- confirm).
//
// Tries the vselect-based FADD combine first, then the vcmla reassociation,
// and returns the first non-empty result, or an empty SDValue if both fail.
17032 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
17033 return S;
17034 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
17035 return S;
17036 return SDValue();
17037}
17038
17039 /// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17040 /// can replace combinations of VCVT (integer to floating-point) and VMUL
17041 /// when the VMUL has a constant operand that is a power of 2.
17042 ///
17043 /// Example (assume d17 = <float 0.125, float 0.125>):
17044 /// vcvt.f32.s32 d16, d16
17045 /// vmul.f32 d16, d16, d17
17046 /// becomes:
17047 /// vcvt.f32.s32 d16, d16, #3
17049 const ARMSubtarget *Subtarget) {
// NOTE(review): the first line of this function's declaration and the first
// line of the statement under `if (IntBits < FloatBits)` are absent from this
// listing.
17050 if (!Subtarget->hasNEON())
17051 return SDValue();
17052
// Operand 0 of the multiply must be an int-to-fp conversion and the result a
// simple-typed vector.
17053 SDValue Op = N->getOperand(0);
17054 unsigned OpOpcode = Op.getNode()->getOpcode();
17055 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
17056 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
17057 return SDValue();
17058
// Only operand 1 of the multiply is inspected for the constant.
17059 SDValue ConstVec = N->getOperand(1);
17060 if (!isa<BuildVectorSDNode>(ConstVec))
17061 return SDValue();
17062
17063 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17064 uint32_t FloatBits = FloatTy.getSizeInBits();
17065 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17066 uint32_t IntBits = IntTy.getSizeInBits();
17067 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17068 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17069 // These instructions only exist converting from i32 to f32. We can handle
17070 // smaller integers by generating an extra extend, but larger ones would
17071 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17072 // these instructions only support v2i32/v4i32 types.
17073 return SDValue();
17074 }
17075
// The constant must have an exact reciprocal; the shift amount is derived
// from that reciprocal below.
17076 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
17077 APFloat Recip(0.0f);
17078 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
17079 return SDValue();
17080
// The reciprocal must convert to an integer exactly.
17081 bool IsExact;
17082 APSInt IntVal(33);
17083 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
17084 APFloat::opOK ||
17085 !IsExact)
17086 return SDValue();
17087
// C is the exact log2 of the reciprocal; values of -1, 0 and anything above
// 32 are rejected.
17088 int32_t C = IntVal.exactLogBase2();
17089 if (C == -1 || C == 0 || C > 32)
17090 return SDValue();
17091
17092 SDLoc DL(N);
17093 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
17094 SDValue ConvInput = Op.getOperand(0);
17095 if (IntBits < FloatBits)
17097 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
17098
17099 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
17100 : Intrinsic::arm_neon_vcvtfxu2fp;
17101 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17102 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17103 DAG.getConstant(C, DL, MVT::i32));
17104}
17105
// Combine for ISD::VECREDUCE_ADD under MVE integer ops: matches extended (and
// optionally multiplied and/or predicated) reduction inputs and replaces them
// with ARMISD VADDV/VADDLV/VMLAV/VMLALV nodes. Returns an empty SDValue when no
// pattern matches or the subtarget lacks MVE integer ops.
// NOTE(review): the first signature line (embedded listing number 17106) is
// absent from this extract.
17107 const ARMSubtarget *ST) {
17108 if (!ST->hasMVEIntegerOps())
17109 return SDValue();
17110
17111 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
17112 EVT ResVT = N->getValueType(0);
17113 SDValue N0 = N->getOperand(0);
17114 SDLoc dl(N);
17115
17116 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
17117 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
17118 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
17119 N0.getValueType() == MVT::v16i8)) {
17120 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
17121 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
17122 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
17123 }
17124
17125 // We are looking for something that will have illegal types if left alone,
17126 // but that we can convert to a single instruction under MVE. For example
17127 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
17128 // or
17129 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
17130
17131 // The legal cases are:
17132 // VADDV u/s 8/16/32
17133 // VMLAV u/s 8/16/32
17134 // VADDLV u/s 32
17135 // VMLALV u/s 16/32
17136
17137 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
17138 // extend it and use v4i32 instead.
// True if A's type has the same lane count as, and is no wider than, at least
// one of the types in ExtTypes.
17139 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
17140 EVT AVT = A.getValueType();
17141 return any_of(ExtTypes, [&](MVT Ty) {
17142 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
17143 AVT.bitsLE(Ty);
17144 });
17145 };
// Widens A with ExtendCode when it is not already a 128-bit vector.
// NOTE(review): listing line 17150 (the start of the result-type argument) is
// missing from this extract; presumably the target type keeps the lane count
// and uses 128 / lanes bits per element -- confirm against the original file.
17146 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
17147 EVT AVT = A.getValueType();
17148 if (!AVT.is128BitVector())
17149 A = DAG.getNode(ExtendCode, dl,
17151 128 / AVT.getVectorMinNumElements())),
17152 A);
17153 return A;
17154 };
// Matches vecreduce.add(ext A) producing RetTy; returns the (possibly widened)
// A on success and an empty SDValue otherwise.
17155 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
17156 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
17157 return SDValue();
17158 SDValue A = N0->getOperand(0);
17159 if (ExtTypeMatches(A, ExtTypes))
17160 return ExtendIfNeeded(A, ExtendCode);
17161 return SDValue();
17162 };
// Predicated form of IsVADDV: N0 must be a VSELECT whose operand 1 is the
// extend. Mask receives the select condition; note it is written before the
// remaining checks, so it may be set even when the match fails.
// NOTE(review): listing line 17166 (the end of the first condition) is missing
// from this extract; presumably it requires the false operand of the select to
// be all zeros -- confirm against the original file.
17163 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17164 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17165 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17167 return SDValue();
17168 Mask = N0->getOperand(0);
17169 SDValue Ext = N0->getOperand(1);
17170 if (Ext->getOpcode() != ExtendCode)
17171 return SDValue();
17172 SDValue A = Ext->getOperand(0);
17173 if (ExtTypeMatches(A, ExtTypes))
17174 return ExtendIfNeeded(A, ExtendCode);
17175 return SDValue();
17176 };
17177 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17178 SDValue &A, SDValue &B) {
17179 // For a vmla we are trying to match a larger pattern:
17180 // ExtA = sext/zext A
17181 // ExtB = sext/zext B
17182 // Mul = mul ExtA, ExtB
17183 // vecreduce.add Mul
17184 // There might also be an extra extend between the mul and the addreduce, so
17185 // long as the bitwidth is high enough to make them equivalent (for example
17186 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
17187 if (ResVT != RetTy)
17188 return false;
17189 SDValue Mul = N0;
17190 if (Mul->getOpcode() == ExtendCode &&
17191 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17192 ResVT.getScalarSizeInBits())
17193 Mul = Mul->getOperand(0);
17194 if (Mul->getOpcode() != ISD::MUL)
17195 return false;
17196 SDValue ExtA = Mul->getOperand(0);
17197 SDValue ExtB = Mul->getOperand(1);
17198 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17199 return false;
// A and B are out-parameters and are overwritten here even if the type check
// below subsequently fails.
17200 A = ExtA->getOperand(0);
17201 B = ExtB->getOperand(0);
17202 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17203 A = ExtendIfNeeded(A, ExtendCode);
17204 B = ExtendIfNeeded(B, ExtendCode);
17205 return true;
17206 }
17207 return false;
17208 };
17209 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17210 SDValue &A, SDValue &B, SDValue &Mask) {
17211 // Same as the pattern above with a select for the zero predicated lanes
17212 // ExtA = sext/zext A
17213 // ExtB = sext/zext B
17214 // Mul = mul ExtA, ExtB
17215 // N0 = select Mask, Mul, 0
17216 // vecreduce.add N0
// NOTE(review): listing line 17218 (the end of this condition) is missing from
// this extract; per the pattern above it presumably checks that the select's
// false operand is zero -- confirm against the original file.
17217 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17219 return false;
17220 Mask = N0->getOperand(0);
17221 SDValue Mul = N0->getOperand(1);
17222 if (Mul->getOpcode() == ExtendCode &&
17223 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17224 ResVT.getScalarSizeInBits())
17225 Mul = Mul->getOperand(0);
17226 if (Mul->getOpcode() != ISD::MUL)
17227 return false;
17228 SDValue ExtA = Mul->getOperand(0);
17229 SDValue ExtB = Mul->getOperand(1);
17230 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17231 return false;
17232 A = ExtA->getOperand(0);
17233 B = ExtB->getOperand(0);
17234 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17235 A = ExtendIfNeeded(A, ExtendCode);
17236 B = ExtendIfNeeded(B, ExtendCode);
17237 return true;
17238 }
17239 return false;
17240 };
// Builds a long (two x i32 result) reduction node and recombines its two
// results into a single i64 with BUILD_PAIR.
17241 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17242 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17243 // reductions. The operands are extended with MVEEXT, but as they are
17244 // reductions the lane orders do not matter. MVEEXT may be combined with
17245 // loads to produce two extending loads, or else they will be expanded to
17246 // VREV/VMOVL.
17247 EVT VT = Ops[0].getValueType();
17248 if (VT == MVT::v16i8) {
17249 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17250 "Unexpected illegal long reduction opcode");
17251 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17252
17253 SDValue Ext0 =
17254 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17255 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17256 SDValue Ext1 =
17257 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17258 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17259
// First half: plain long multiply-accumulate reduction of result 0 of each
// extend. Second half: accumulating variant seeded with both i32 results of
// MLA0, applied to result 1 of each extend.
17260 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17261 Ext0, Ext1);
17262 SDValue MLA1 =
17263 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17264 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17265 Ext0.getValue(1), Ext1.getValue(1));
17266 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17267 }
17268 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17269 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17270 SDValue(Node.getNode(), 1));
17271 };
17272
17273 SDValue A, B;
17274 SDValue Mask;
// Unpredicated multiply-accumulate reductions. i16 results are computed at i32
// and truncated.
17275 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17276 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17277 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17278 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17279 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17280 A, B))
17281 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17282 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17283 A, B))
17284 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17285 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17286 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17287 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17288 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17289 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17290 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17291
// Predicated multiply-accumulate reductions. Unlike the unpredicated i64 case,
// v16i8 is not in the accepted type list for the i64 forms here.
17292 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17293 Mask))
17294 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17295 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17296 Mask))
17297 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17298 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17299 Mask))
17300 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17301 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17302 Mask))
17303 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17304 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17305 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17306 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17307 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17308 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17309 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17310
// Unpredicated plain add reductions.
17311 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17312 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17313 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17314 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17315 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17316 return Create64bitNode(ARMISD::VADDLVs, {A});
17317 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17318 return Create64bitNode(ARMISD::VADDLVu, {A});
17319 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17320 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17321 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17322 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17323 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17324 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17325
// Predicated plain add reductions.
17326 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17327 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17328 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17329 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17330 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17331 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17332 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17333 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17334 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17335 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17336 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17337 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17338 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17339 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17340
17341 // Some complications. We can get a case where the two inputs of the mul are
17342 // the same, then the output sext will have been helpfully converted to a
17343 // zext. Turn it back.
17344 SDValue Op = N0;
17345 if (Op->getOpcode() == ISD::VSELECT)
17346 Op = Op->getOperand(1);
17347 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17348 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17349 SDValue Mul = Op->getOperand(0);
17350 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17351 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17352 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
// If we looked through a VSELECT above, rebuild it around the new sign extend
// with the original condition and false operand.
17353 if (Op != N0)
17354 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17355 N0->getOperand(0), Ext, N0->getOperand(2));
17356 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17357 }
17358 }
17359
17360 return SDValue();
17361}
17362
17363// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17364// the lanes are used. Due to the reduction being commutative the shuffle can be
17365// removed.
// Returns the rebuilt reduction node, or an empty SDValue if the operands are
// not single-input shuffles that use every lane exactly once.
// NOTE(review): the signature line (embedded listing number 17366) is absent
// from this extract.
// The first vector operand is operand 0, or operand 2 when operand 0 is not a
// vector (presumably the accumulating forms, whose first two operands are
// scalars -- TODO confirm against the callers).
17367 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17368 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17369 if (!Shuf || !Shuf->getOperand(1).isUndef())
17370 return SDValue();
17371
17372 // Check all elements are used once in the mask.
17373 ArrayRef<int> Mask = Shuf->getMask();
17374 APInt SetElts(Mask.size(), 0);
17375 for (int E : Mask) {
17376 if (E < 0 || E >= (int)Mask.size())
17377 return SDValue();
17378 SetElts.setBit(E);
17379 }
17380 if (!SetElts.isAllOnes())
17381 return SDValue();
17382
// With a second vector operand, it must be a single-input shuffle using the
// exact same mask so that lanes stay paired up.
17383 if (N->getNumOperands() != VecOp + 1) {
17384 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17385 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17386 return SDValue();
17387 }
17388
// NOTE(review): listing line 17389 is missing here; it presumably declares the
// `Ops` container filled below -- confirm against the original file.
// Replace every vector operand with its shuffle input; keep scalars as-is.
17390 for (SDValue Op : N->ops()) {
17391 if (Op.getValueType().isVector())
17392 Ops.push_back(Op.getOperand(0));
17393 else
17394 Ops.push_back(Op);
17395 }
17396 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17397}
17398
// Combine for a VMOVN node N (operands: Op0, Op1, constant top/bottom flag).
// Folds undef operands, merges with a bottom-lane VQMOVN producer, and
// otherwise narrows the demanded lanes of both operands.
// NOTE(review): the signature lines (embedded listing numbers 17399-17400) are
// absent from this extract.
17401 SDValue Op0 = N->getOperand(0);
17402 SDValue Op1 = N->getOperand(1);
17403 unsigned IsTop = N->getConstantOperandVal(2);
17404
17405 // VMOVNT a undef -> a
17406 // VMOVNB a undef -> a
17407 // VMOVNB undef a -> a
17408 if (Op1->isUndef())
17409 return Op0;
17410 if (Op0->isUndef() && !IsTop)
17411 return Op1;
17412
17413 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17414 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
17415 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17416 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17417 Op1->getConstantOperandVal(2) == 0)
17418 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17419 Op0, Op1->getOperand(1), N->getOperand(2));
17420
17421 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17422 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17423 // into the top or bottom lanes.
// The masks are per-lane patterns: getLowBitsSet(2, 1) splatted selects the
// even lanes, getHighBitsSet(2, 1) splatted selects the odd lanes.
17424 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17425 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17426 APInt Op0DemandedElts =
17427 IsTop ? Op1DemandedElts
17428 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17429
// Returning SDValue(N, 0) signals that N's operands were simplified in place.
17430 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17431 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17432 return SDValue(N, 0);
17433 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17434 return SDValue(N, 0);
17435
17436 return SDValue();
17437}
17438
// Demanded-elements simplification for operand 0 of N, where operand 2 is a
// constant top/bottom flag: when IsTop is set only the even lanes of Op0 are
// demanded, otherwise only the odd lanes. Returns SDValue(N, 0) if Op0 was
// simplified and an empty SDValue otherwise.
// NOTE(review): the signature lines (embedded listing numbers 17439-17440) are
// absent from this extract, so the node kind handled here is not visible.
17441 SDValue Op0 = N->getOperand(0);
17442 unsigned IsTop = N->getConstantOperandVal(2);
17443
17444 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17445 APInt Op0DemandedElts =
17446 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17447 : APInt::getHighBitsSet(2, 1));
17448
17449 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17450 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17451 return SDValue(N, 0);
17452 return SDValue();
17453}
17454
// Hoists a common shuffle out of a binary node: when both operands are
// single-input shuffles with identical masks, the operation is applied to the
// unshuffled inputs and the result is shuffled once. Returns an empty SDValue
// when the pattern does not match.
// NOTE(review): the signature lines (embedded listing numbers 17455-17456) are
// absent from this extract.
17457 EVT VT = N->getValueType(0);
17458 SDValue LHS = N->getOperand(0);
17459 SDValue RHS = N->getOperand(1);
17460
17461 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17462 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17463 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
// Only done when at least one shuffle has a single use (or both operands are
// the same shuffle), so the rewrite does not leave both original shuffles
// alive next to the new one.
17464 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17465 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17466 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17467 SDLoc DL(N);
17468 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17469 LHS.getOperand(0), RHS.getOperand(0));
// Reuse the existing undef second shuffle operand for the new shuffle.
17470 SDValue UndefV = LHS.getOperand(1);
17471 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17472 }
17473 return SDValue();
17474}
17475
// Combine for ARMISD::LSLL / ARMISD::LSRL nodes with a constant shift amount
// in operand 2: a zero shift is replaced by its two inputs, and a negative
// shift in [-32, -1] is turned into the opposite-direction shift by the
// negated amount.
// NOTE(review): the signature line (embedded listing number 17476) is absent
// from this extract.
17477 SDLoc DL(N);
17478 SDValue Op0 = N->getOperand(0);
17479 SDValue Op1 = N->getOperand(1);
17480
17481 // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
17482 // uses of the intrinsics.
17483 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17484 int ShiftAmt = C->getSExtValue();
17485 if (ShiftAmt == 0) {
// Shift by zero: forward both inputs to N's users directly. The uses are
// replaced here and an empty SDValue is returned.
17486 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17487 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17488 return SDValue();
17489 }
17490
17491 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17492 unsigned NewOpcode =
17493 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17494 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17495 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17496 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17497 return NewShift;
17498 }
17499 }
17500
17501 return SDValue();
17502}
17503
17504/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
/// Dispatches on the intrinsic ID held in operand 0 of N. Returns the
/// replacement node, or an empty SDValue when nothing is changed.
/// NOTE(review): the first signature line (embedded listing number 17505) is
/// absent from this extract.
17506 DAGCombinerInfo &DCI) const {
17507 SelectionDAG &DAG = DCI.DAG;
17508 unsigned IntNo = N->getConstantOperandVal(0);
17509 switch (IntNo) {
17510 default:
17511 // Don't do anything for most intrinsics.
17512 break;
17513
17514 // Vector shifts: check for immediate versions and lower them.
17515 // Note: This is done during DAG combining instead of DAG legalizing because
17516 // the build_vectors for 64-bit vector element shift counts are generally
17517 // not legal, and it is hard to see their values after they get legalized to
17518 // loads from a constant pool.
17519 case Intrinsic::arm_neon_vshifts:
17520 case Intrinsic::arm_neon_vshiftu:
17521 case Intrinsic::arm_neon_vrshifts:
17522 case Intrinsic::arm_neon_vrshiftu:
17523 case Intrinsic::arm_neon_vrshiftn:
17524 case Intrinsic::arm_neon_vqshifts:
17525 case Intrinsic::arm_neon_vqshiftu:
17526 case Intrinsic::arm_neon_vqshiftsu:
17527 case Intrinsic::arm_neon_vqshiftns:
17528 case Intrinsic::arm_neon_vqshiftnu:
17529 case Intrinsic::arm_neon_vqshiftnsu:
17530 case Intrinsic::arm_neon_vqrshiftns:
17531 case Intrinsic::arm_neon_vqrshiftnu:
17532 case Intrinsic::arm_neon_vqrshiftnsu: {
17533 EVT VT = N->getOperand(1).getValueType();
17534 int64_t Cnt;
17535 unsigned VShiftOpc = 0;
17536
// First switch: validate the shift-count operand and extract the immediate
// into Cnt. Non-immediate counts either bail out (return SDValue()) or, for
// intrinsics that require an immediate, hit llvm_unreachable.
17537 switch (IntNo) {
17538 case Intrinsic::arm_neon_vshifts:
17539 case Intrinsic::arm_neon_vshiftu:
17540 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17541 VShiftOpc = ARMISD::VSHLIMM;
17542 break;
17543 }
// NOTE(review): listing line 17546 (the end of this conditional expression) is
// missing from this extract; presumably it selects the unsigned right-shift
// opcode for vshiftu -- confirm against the original file.
17544 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17545 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17547 break;
17548 }
17549 return SDValue();
17550
17551 case Intrinsic::arm_neon_vrshifts:
17552 case Intrinsic::arm_neon_vrshiftu:
17553 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17554 break;
17555 return SDValue();
17556
17557 case Intrinsic::arm_neon_vqshifts:
17558 case Intrinsic::arm_neon_vqshiftu:
17559 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17560 break;
17561 return SDValue();
17562
17563 case Intrinsic::arm_neon_vqshiftsu:
17564 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17565 break;
17566 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17567
17568 case Intrinsic::arm_neon_vrshiftn:
17569 case Intrinsic::arm_neon_vqshiftns:
17570 case Intrinsic::arm_neon_vqshiftnu:
17571 case Intrinsic::arm_neon_vqshiftnsu:
17572 case Intrinsic::arm_neon_vqrshiftns:
17573 case Intrinsic::arm_neon_vqrshiftnu:
17574 case Intrinsic::arm_neon_vqrshiftnsu:
17575 // Narrowing shifts require an immediate right shift.
17576 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17577 break;
17578 llvm_unreachable("invalid shift count for narrowing vector shift "
17579 "intrinsic");
17580
17581 default:
17582 llvm_unreachable("unhandled vector shift");
17583 }
17584
// Second switch: map the intrinsic to its immediate-form ARMISD opcode.
17585 switch (IntNo) {
17586 case Intrinsic::arm_neon_vshifts:
17587 case Intrinsic::arm_neon_vshiftu:
17588 // Opcode already set above.
17589 break;
17590 case Intrinsic::arm_neon_vrshifts:
17591 VShiftOpc = ARMISD::VRSHRsIMM;
17592 break;
17593 case Intrinsic::arm_neon_vrshiftu:
17594 VShiftOpc = ARMISD::VRSHRuIMM;
17595 break;
17596 case Intrinsic::arm_neon_vrshiftn:
17597 VShiftOpc = ARMISD::VRSHRNIMM;
17598 break;
17599 case Intrinsic::arm_neon_vqshifts:
17600 VShiftOpc = ARMISD::VQSHLsIMM;
17601 break;
17602 case Intrinsic::arm_neon_vqshiftu:
17603 VShiftOpc = ARMISD::VQSHLuIMM;
17604 break;
17605 case Intrinsic::arm_neon_vqshiftsu:
17606 VShiftOpc = ARMISD::VQSHLsuIMM;
17607 break;
17608 case Intrinsic::arm_neon_vqshiftns:
17609 VShiftOpc = ARMISD::VQSHRNsIMM;
17610 break;
17611 case Intrinsic::arm_neon_vqshiftnu:
17612 VShiftOpc = ARMISD::VQSHRNuIMM;
17613 break;
17614 case Intrinsic::arm_neon_vqshiftnsu:
17615 VShiftOpc = ARMISD::VQSHRNsuIMM;
17616 break;
17617 case Intrinsic::arm_neon_vqrshiftns:
17618 VShiftOpc = ARMISD::VQRSHRNsIMM;
17619 break;
17620 case Intrinsic::arm_neon_vqrshiftnu:
17621 VShiftOpc = ARMISD::VQRSHRNuIMM;
17622 break;
17623 case Intrinsic::arm_neon_vqrshiftnsu:
17624 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17625 break;
17626 }
17627
17628 SDLoc dl(N);
17629 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17630 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17631 }
17632
// Shift-and-insert: the shift count is operand 3 and must be an immediate left
// (VSLI) or right (VSRI) shift.
17633 case Intrinsic::arm_neon_vshiftins: {
17634 EVT VT = N->getOperand(1).getValueType();
17635 int64_t Cnt;
17636 unsigned VShiftOpc = 0;
17637
17638 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17639 VShiftOpc = ARMISD::VSLIIMM;
17640 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17641 VShiftOpc = ARMISD::VSRIIMM;
17642 else {
17643 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17644 }
17645
17646 SDLoc dl(N);
17647 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17648 N->getOperand(1), N->getOperand(2),
17649 DAG.getConstant(Cnt, dl, MVT::i32));
17650 }
17651
17652 case Intrinsic::arm_neon_vqrshifts:
17653 case Intrinsic::arm_neon_vqrshiftu:
17654 // No immediate versions of these to check for.
17655 break;
17656
// vbsl maps directly onto an ARMISD::VBSP node with the same three operands.
17657 case Intrinsic::arm_neon_vbsl: {
17658 SDLoc dl(N);
17659 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17660 N->getOperand(2), N->getOperand(3));
17661 }
17662 case Intrinsic::arm_mve_vqdmlah:
17663 case Intrinsic::arm_mve_vqdmlash:
17664 case Intrinsic::arm_mve_vqrdmlah:
17665 case Intrinsic::arm_mve_vqrdmlash:
17666 case Intrinsic::arm_mve_vmla_n_predicated:
17667 case Intrinsic::arm_mve_vmlas_n_predicated:
17668 case Intrinsic::arm_mve_vqdmlah_predicated:
17669 case Intrinsic::arm_mve_vqdmlash_predicated:
17670 case Intrinsic::arm_mve_vqrdmlah_predicated:
17671 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17672 // These intrinsics all take an i32 scalar operand which is narrowed to the
17673 // size of a single lane of the vector type they return. So we don't need
17674 // any bits of that operand above that point, which allows us to eliminate
17675 // uxth/sxth.
17676 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17677 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17678 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17679 return SDValue();
17680 break;
17681 }
17682
17683 case Intrinsic::arm_mve_minv:
17684 case Intrinsic::arm_mve_maxv:
17685 case Intrinsic::arm_mve_minav:
17686 case Intrinsic::arm_mve_maxav:
17687 case Intrinsic::arm_mve_minv_predicated:
17688 case Intrinsic::arm_mve_maxv_predicated:
17689 case Intrinsic::arm_mve_minav_predicated:
17690 case Intrinsic::arm_mve_maxav_predicated: {
17691 // These intrinsics all take an i32 scalar operand which is narrowed to the
17692 // size of a single lane of the vector type they take as the other input.
17693 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17694 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17695 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17696 return SDValue();
17697 break;
17698 }
17699
17700 case Intrinsic::arm_mve_addv: {
17701 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17702 // which allow PerformADDVecReduce to turn it into VADDLV when possible.
17703 bool Unsigned = N->getConstantOperandVal(2);
17704 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17705 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17706 }
17707
17708 case Intrinsic::arm_mve_addlv:
17709 case Intrinsic::arm_mve_addlv_predicated: {
17710 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17711 // which recombines the two outputs into an i64
17712 bool Unsigned = N->getConstantOperandVal(2);
// NOTE(review): listing lines 17714-17715 (the rest of this opcode selection)
// and 17717 (presumably the declaration of `Ops`) are missing from this
// extract -- confirm against the original file.
17713 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17716
17718 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17719 if (i != 2) // skip the unsigned flag
17720 Ops.push_back(N->getOperand(i));
17721
17722 SDLoc dl(N);
17723 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17724 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17725 val.getValue(1));
17726 }
17727 }
17728
17729 return SDValue();
17730}
17731
17732/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17733/// lowers them. As with the vector shift intrinsics, this is done during DAG
17734/// combining instead of DAG legalizing because the build_vectors for 64-bit
17735/// vector element shift counts are generally not legal, and it is hard to see
17736/// their values after they get legalized to loads from a constant pool.
/// It additionally rewrites a Thumb1-only scalar (shl (and x, mask), amt)
/// pattern into a shift pair. Returns an empty SDValue when nothing applies.
/// NOTE(review): the leading signature lines (embedded listing numbers
/// 17737-17738) are absent from this extract.
17739 const ARMSubtarget *ST) {
17740 SelectionDAG &DAG = DCI.DAG;
17741 EVT VT = N->getValueType(0);
17742
17743 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17744 N->getOperand(0)->getOpcode() == ISD::AND &&
17745 N->getOperand(0)->hasOneUse()) {
17746 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17747 return SDValue();
17748 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17749 // usually show up because instcombine prefers to canonicalize it to
17750 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17751 // out of GEP lowering in some cases.
17752 SDValue N0 = N->getOperand(0);
17753 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17754 if (!ShiftAmtNode)
17755 return SDValue();
17756 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17757 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17758 if (!AndMaskNode)
17759 return SDValue();
17760 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17761 // Don't transform uxtb/uxth.
17762 if (AndMask == 255 || AndMask == 65535)
17763 return SDValue();
// For a low-bit mask, clear the high bits by shifting left by the number of
// leading zeros, then shift right so the net effect is a left shift by
// ShiftAmt. Only valid when the mask clears more bits than the shift moves.
17764 if (isMask_32(AndMask)) {
17765 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17766 if (MaskedBits > ShiftAmt) {
17767 SDLoc DL(N);
17768 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17769 DAG.getConstant(MaskedBits, DL, MVT::i32));
17770 return DAG.getNode(
17771 ISD::SRL, DL, MVT::i32, SHL,
17772 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17773 }
17774 }
17775 }
17776
17777 // Nothing to be done for scalar shifts.
17778 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17779 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17780 return SDValue();
// The immediate-shift lowering below is skipped entirely for MVE subtargets.
17781 if (ST->hasMVEIntegerOps())
17782 return SDValue();
17783
17784 int64_t Cnt;
17785
17786 switch (N->getOpcode()) {
17787 default: llvm_unreachable("unexpected shift opcode");
17788
17789 case ISD::SHL:
17790 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17791 SDLoc dl(N);
17792 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17793 DAG.getConstant(Cnt, dl, MVT::i32));
17794 }
17795 break;
17796
17797 case ISD::SRA:
17798 case ISD::SRL:
17799 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17800 unsigned VShiftOpc =
17801 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17802 SDLoc dl(N);
17803 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17804 DAG.getConstant(Cnt, dl, MVT::i32));
17805 }
17806 }
17807 return SDValue();
17808}
17809
17810// Look for a sign/zero/fpextend extend of a larger than legal load. This can be
17811// split into multiple extending loads, which are simpler to deal with than an
17812// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17813// to convert the type to an f32.
17815 SDValue N0 = N->getOperand(0);
17816 if (N0.getOpcode() != ISD::LOAD)
17817 return SDValue();
17819 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17820 LD->getExtensionType() != ISD::NON_EXTLOAD)
17821 return SDValue();
17822 EVT FromVT = LD->getValueType(0);
17823 EVT ToVT = N->getValueType(0);
17824 if (!ToVT.isVector())
17825 return SDValue();
17827 EVT ToEltVT = ToVT.getVectorElementType();
17828 EVT FromEltVT = FromVT.getVectorElementType();
17829
17830 unsigned NumElements = 0;
17831 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17832 NumElements = 4;
17833 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17834 NumElements = 4;
17835 if (NumElements == 0 ||
17836 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17837 FromVT.getVectorNumElements() % NumElements != 0 ||
17838 !isPowerOf2_32(NumElements))
17839 return SDValue();
17840
17841 LLVMContext &C = *DAG.getContext();
17842 SDLoc DL(LD);
17843 // Details about the old load
17844 SDValue Ch = LD->getChain();
17845 SDValue BasePtr = LD->getBasePtr();
17846 Align Alignment = LD->getOriginalAlign();
17847 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17848 AAMDNodes AAInfo = LD->getAAInfo();
17849
17850 ISD::LoadExtType NewExtType =
17851 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17852 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17853 EVT NewFromVT = EVT::getVectorVT(
17854 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17855 EVT NewToVT = EVT::getVectorVT(
17856 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17857
17860 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17861 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17862 SDValue NewPtr =
17863 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17864
17865 SDValue NewLoad =
17866 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17867 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17868 Alignment, MMOFlags, AAInfo);
17869 Loads.push_back(NewLoad);
17870 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17871 }
17872
17873 // Float truncs need to be extended with VCVTB's into their floating point types.
17874 if (FromEltVT == MVT::f16) {
17876
17877 for (unsigned i = 0; i < Loads.size(); i++) {
17878 SDValue LoadBC =
17879 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17880 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17881 DAG.getConstant(0, DL, MVT::i32));
17882 Extends.push_back(FPExt);
17883 }
17884
17885 Loads = Extends;
17886 }
17887
17888 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17889 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17890 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
17891}
17892
17893/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17894/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
///
/// Two rewrites are attempted, in order:
///  1. An extend to i32 of an i8/i16 value N0, where N0's operand 0 is a
///     vector of a legal type and N0's operand 1 is a constant lane index,
///     becomes ARMISD::VGETLANEs (sign extend) or ARMISD::VGETLANEu
///     (zero/any extend) of that vector and lane.
///  2. When the subtarget has MVE integer ops, the node is handed to
///     PerformSplittingToWideningLoad.
/// An empty SDValue is returned when neither rewrite applies.
/// NOTE(review): the line carrying the function name and leading parameters
/// is absent from this listing; N and DAG are used below - confirm upstream.
17896 const ARMSubtarget *ST) {
17897 SDValue N0 = N->getOperand(0);
17898
17899 // Check for sign- and zero-extensions of vector extract operations of 8- and
17900 // 16-bit vector elements. NEON and MVE support these directly. They are
17901 // handled during DAG combining because type legalization will promote them
17902 // to 32-bit types and it is messy to recognize the operations after that.
// NOTE(review): the second half of the condition below is missing from this
// listing; per the comment above it presumably tests that N0 is a vector
// element extract - confirm against upstream.
17903 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
17905 SDValue Vec = N0.getOperand(0);
17906 SDValue Lane = N0.getOperand(1);
17907 EVT VT = N->getValueType(0);
17908 EVT EltVT = N0.getValueType();
17909 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17910
// Only an i32 result, an i8/i16 extracted value, a legal vector type and a
// constant lane index are accepted.
17911 if (VT == MVT::i32 &&
17912 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17913 TLI.isTypeLegal(Vec.getValueType()) &&
17914 isa<ConstantSDNode>(Lane)) {
17915
// Pick the signed or unsigned lane-read node from the kind of extend; an
// any-extend is treated like a zero-extend.
17916 unsigned Opc = 0;
17917 switch (N->getOpcode()) {
17918 default: llvm_unreachable("unexpected opcode");
17919 case ISD::SIGN_EXTEND:
17920 Opc = ARMISD::VGETLANEs;
17921 break;
17922 case ISD::ZERO_EXTEND:
17923 case ISD::ANY_EXTEND:
17924 Opc = ARMISD::VGETLANEu;
17925 break;
17926 }
17927 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
17928 }
17929 }
17930
// Otherwise, with MVE integer ops, try the splitting-to-widening-load rewrite.
17931 if (ST->hasMVEIntegerOps())
17932 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17933 return NewLoad;
17934
17935 return SDValue();
17936}
17937
17939 const ARMSubtarget *ST) {
// Combine that only fires when the subtarget has MVE floating-point ops: it
// forwards N to PerformSplittingToWideningLoad and returns that result if it
// is non-empty; otherwise it returns an empty SDValue (no change).
// NOTE(review): the line with the function name and leading parameters is
// absent from this listing; N and DAG are used below - confirm upstream.
17940 if (ST->hasMVEFloatOps())
17941 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17942 return NewLoad;
17943
17944 return SDValue();
17945}
17946
17947// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
17948// constant bounds.
//
// Op is the outer min/max node and its operand 0 is the inner one; either
// nesting order is accepted. With MinC the SMIN constant and MaxC the SMAX
// constant, MinC + 1 must be a power of two, and then:
//   MinC == ~MaxC -> ARMISD::SSAT(x, countr_one(MinC))
//   MaxC == 0     -> ARMISD::USAT(x, countr_one(MinC))
// Returns an empty SDValue when the pattern does not match.
// NOTE(review): the line with the function name and leading parameters is
// absent from this listing; Op and DAG are used below - confirm upstream.
17950 const ARMSubtarget *Subtarget) {
// Bail out unless the subtarget is Thumb2, or is non-Thumb with V6 ops.
17951 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17952 !Subtarget->isThumb2())
17953 return SDValue();
17954
17955 EVT VT = Op.getValueType();
17956 SDValue Op0 = Op.getOperand(0);
17957
// Require an i32 result, an SMIN/SMAX as operand 0, and a constant operand 1.
// NOTE(review): one line of this condition is missing from the listing;
// since Max.getConstantOperandAPInt(1) is read below, it presumably checks
// that Op0's operand 1 is also a constant - confirm against upstream.
17958 if (VT != MVT::i32 ||
17959 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17960 !isa<ConstantSDNode>(Op.getOperand(1)) ||
17962 return SDValue();
17963
// Assume the outer node is the SMIN and the inner one the SMAX; swap the
// roles if the outer node is actually the SMAX.
17964 SDValue Min = Op;
17965 SDValue Max = Op0;
17966 SDValue Input = Op0.getOperand(0);
17967 if (Min.getOpcode() == ISD::SMAX)
17968 std::swap(Min, Max);
17969
17970 APInt MinC = Min.getConstantOperandAPInt(1);
17971 APInt MaxC = Max.getConstantOperandAPInt(1);
17972
// After the swap the pair must be exactly one SMIN and one SMAX, and the
// upper bound must be of the form 2^k - 1.
17973 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17974 !(MinC + 1).isPowerOf2())
17975 return SDValue();
17976
17977 SDLoc DL(Op);
// Symmetric signed range [~MinC, MinC] -> SSAT.
17978 if (MinC == ~MaxC)
17979 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
17980 DAG.getConstant(MinC.countr_one(), DL, VT));
// Range [0, MinC] -> USAT.
17981 if (MaxC == 0)
17982 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
17983 DAG.getConstant(MinC.countr_one(), DL, VT));
17984
17985 return SDValue();
17986}
17987
17988/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
17989/// saturates.
///
/// Order of attempts:
///  - i32 results are delegated entirely to PerformMinMaxToSatCombine.
///  - Without MVE integer ops nothing else is tried.
///  - PerformVQDMULHCombine is tried next.
///  - For v4i32 / v8i16 only: a signed smin/smax pair with splat bounds
///    (0x7fff / ~0x7fff for v4i32, 0x7f / ~0x7f for v8i16) becomes
///    ARMISD::VQMOVNs followed by SIGN_EXTEND_INREG; a umin against a splat
///    of 0xffff (v4i32) or 0xff (v8i16) becomes ARMISD::VQMOVNu followed by
///    an AND mask.
/// Returns an empty SDValue when nothing matches.
/// NOTE(review): the line with the function name and leading parameters is
/// absent from this listing; N and DAG are used below - confirm upstream.
17991 const ARMSubtarget *ST) {
17992 EVT VT = N->getValueType(0);
17993 SDValue N0 = N->getOperand(0);
17994
17995 if (VT == MVT::i32)
17996 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
17997
17998 if (!ST->hasMVEIntegerOps())
17999 return SDValue();
18000
18001 if (SDValue V = PerformVQDMULHCombine(N, DAG))
18002 return V;
18003
18004 if (VT != MVT::v4i32 && VT != MVT::v8i16)
18005 return SDValue();
18006
// True when {Min, Max} (in either order) is an SMIN/SMAX pair whose splat
// constants are the signed bounds of the half-width element type.
18007 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
18008 // Check one is a smin and the other is a smax
18009 if (Min->getOpcode() != ISD::SMIN)
18010 std::swap(Min, Max);
18011 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
18012 return false;
18013
// Upper bound: largest signed value of the half-width element.
18014 APInt SaturateC;
18015 if (VT == MVT::v4i32)
18016 SaturateC = APInt(32, (1 << 15) - 1, true);
18017 else //if (VT == MVT::v8i16)
18018 SaturateC = APInt(16, (1 << 7) - 1, true);
18019
// The SMIN constant must be the upper bound and the SMAX constant its
// bitwise complement (the matching lower bound).
18020 APInt MinC, MaxC;
18021 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18022 MinC != SaturateC)
18023 return false;
18024 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
18025 MaxC != ~SaturateC)
18026 return false;
18027 return true;
18028 };
18029
18030 if (IsSignedSaturate(N, N0.getNode())) {
18031 SDLoc DL(N);
// HalfVT: same-size vector with half-width lanes (result type of VQMOVN).
// ExtVT: the narrow type the final in-register sign extend extends from.
18032 MVT ExtVT, HalfVT;
18033 if (VT == MVT::v4i32) {
18034 HalfVT = MVT::v8i16;
18035 ExtVT = MVT::v4i16;
18036 } else { // if (VT == MVT::v8i16)
18037 HalfVT = MVT::v16i8;
18038 ExtVT = MVT::v8i8;
18039 }
18040
18041 // Create a VQMOVNB with undef top lanes, then signed extended into the top
18042 // half. That extend will hopefully be removed if only the bottom bits are
18043 // demanded (through a truncating store, for example).
// The saturated input is operand 0 of the inner min/max node (N0).
18044 SDValue VQMOVN =
18045 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
18046 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
18047 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18048 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
18049 DAG.getValueType(ExtVT));
18050 }
18051
// True when Min is a UMIN against a splat of the all-ones value of the
// half-width element (0xffff for v4i32, 0xff for v8i16).
18052 auto IsUnsignedSaturate = [&](SDNode *Min) {
18053 // For unsigned, we just need to check for <= 0xffff
18054 if (Min->getOpcode() != ISD::UMIN)
18055 return false;
18056
18057 APInt SaturateC;
18058 if (VT == MVT::v4i32)
18059 SaturateC = APInt(32, (1 << 16) - 1, true);
18060 else //if (VT == MVT::v8i16)
18061 SaturateC = APInt(16, (1 << 8) - 1, true);
18062
18063 APInt MinC;
18064 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18065 MinC != SaturateC)
18066 return false;
18067 return true;
18068 };
18069
18070 if (IsUnsignedSaturate(N)) {
18071 SDLoc DL(N);
// ExtConst is the AND mask that keeps only the low half of each lane.
18072 MVT HalfVT;
18073 unsigned ExtConst;
18074 if (VT == MVT::v4i32) {
18075 HalfVT = MVT::v8i16;
18076 ExtConst = 0x0000FFFF;
18077 } else { //if (VT == MVT::v8i16)
18078 HalfVT = MVT::v16i8;
18079 ExtConst = 0x00FF;
18080 }
18081
18082 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
18083 // an AND. That extend will hopefully be removed if only the bottom bits are
18084 // demanded (through a truncating store, for example).
// Here the saturated input is N0 itself, i.e. operand 0 of the UMIN.
18085 SDValue VQMOVN =
18086 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
18087 DAG.getConstant(0, DL, MVT::i32));
18088 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18089 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
18090 DAG.getConstant(ExtConst, DL, VT));
18091 }
18092
18093 return SDValue();
18094}
18095
// Returns a pointer to C's APInt value when that value is a power of two,
// and nullptr when C is null or the value is not a power of two.
// NOTE(review): the signature and the declaration of C are missing from this
// listing. Callers in this file use isPowerOf2Constant(SDValue) and receive a
// `const APInt *`, so C is presumably the operand viewed as a ConstantSDNode
// (null if it is not one) - confirm against upstream.
18098 if (!C)
18099 return nullptr;
18100 const APInt *CV = &C->getAPIntValue();
18101 return CV->isPowerOf2() ? CV : nullptr;
18102}
18103
18105 // If we have a CMOV, OR and AND combination such as:
18106 // if (x & CN)
18107 // y |= CM;
18108 //
18109 // And:
18110 // * CN is a single bit;
18111 // * All bits covered by CM are known zero in y
18112 //
18113 // Then we can convert this into a sequence of BFI instructions. This will
18114 // always be a win if CM is a single bit, will always be no worse than the
18115 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
18116 // three bits (due to the extra IT instruction).
// NOTE(review): the signature line is missing from this listing; the body
// uses CMOV (the node being combined), DAG and Subtarget - confirm upstream.
18117
// Operand layout read here: 0 and 1 are the two selected values, 2 is the
// condition code constant, 4 is the compare node.
18118 SDValue Op0 = CMOV->getOperand(0);
18119 SDValue Op1 = CMOV->getOperand(1);
18120 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
18121 SDValue CmpZ = CMOV->getOperand(4);
18122
18123 // The compare must be against zero.
18124 if (!isNullConstant(CmpZ->getOperand(1)))
18125 return SDValue();
18126
// The compared value must be (and X, single-bit constant).
18127 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
18128 SDValue And = CmpZ->getOperand(0);
18129 if (And->getOpcode() != ISD::AND)
18130 return SDValue();
18131 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
18132 if (!AndC)
18133 return SDValue();
18134 SDValue X = And->getOperand(0);
18135
18136 if (CC == ARMCC::EQ) {
18137 // We're performing an "equal to zero" compare. Swap the operands so we
18138 // canonicalize on a "not equal to zero" compare.
18139 std::swap(Op0, Op1);
18140 } else {
18141 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
18142 }
18143
// After canonicalization Op1 must be (or Y, OrC) and Op0 must be Y itself.
18144 if (Op1->getOpcode() != ISD::OR)
18145 return SDValue();
18146
// NOTE(review): the declaration of OrC is missing from this listing; from
// the null check and OrC->getAPIntValue() below it is presumably operand 1
// of the OR viewed as a constant node - confirm against upstream.
18148 if (!OrC)
18149 return SDValue();
18150 SDValue Y = Op1->getOperand(0);
18151
18152 if (Op0 != Y)
18153 return SDValue();
18154
18155 // Now, is it profitable to continue?
// One BFI is emitted per set bit of the OR constant, so cap the popcount at
// 3 for Thumb and 2 otherwise.
18156 APInt OrCI = OrC->getAPIntValue();
18157 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
18158 if (OrCI.popcount() > Heuristic)
18159 return SDValue();
18160
18161 // Lastly, can we determine that the bits defined by OrCI
18162 // are zero in Y?
18163 KnownBits Known = DAG.computeKnownBits(Y);
18164 if ((OrCI & Known.Zero) != OrCI)
18165 return SDValue();
18166
18167 // OK, we can do the combine.
18168 SDValue V = Y;
18169 SDLoc dl(X);
18170 EVT VT = X.getValueType();
18171 unsigned BitInX = AndC->logBase2();
18172
18173 if (BitInX != 0) {
18174 // We must shift X first.
// Moves the tested bit of X down to bit 0.
18175 X = DAG.getNode(ISD::SRL, dl, VT, X,
18176 DAG.getConstant(BitInX, dl, VT));
18177 }
18178
// Chain one BFI per set bit of OrCI, each targeting that single bit of V.
18179 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
18180 BitInY < NumActiveBits; ++BitInY) {
18181 if (OrCI[BitInY] == 0)
18182 continue;
18183 APInt Mask(VT.getSizeInBits(), 0);
18184 Mask.setBit(BitInY);
18185 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
18186 // Confusingly, the operand is an *inverted* mask.
18187 DAG.getConstant(~Mask, dl, VT));
18188 }
18189
18190 return V;
18191}
18192
18193// Given N, the value controlling the conditional branch, search for the loop
18194// intrinsic, returning it, along with how the value is used. We need to handle
18195// patterns such as the following:
18196// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
18197// (brcond (setcc (loop.decrement), 0, eq), exit)
18198// (brcond (setcc (loop.decrement), 0, ne), header)
//
// CC, Imm and Negate are updated along the way: each (xor _, 1) toggles
// Negate, and each setcc against constant 0 or 1 overwrites Imm with that
// constant and CC with the setcc's condition code. Returns the intrinsic
// node, or an empty SDValue when the walk meets anything else.
// NOTE(review): the line with the function name and leading parameters is
// absent from this listing; the recursive calls show the argument order
// (N, CC, Imm, Negate) - confirm the exact types upstream.
18200 bool &Negate) {
18201 switch (N->getOpcode()) {
18202 default:
18203 break;
18204 case ISD::XOR: {
// Only an xor with the constant 1 is looked through.
18205 if (!isa<ConstantSDNode>(N.getOperand(1)))
18206 return SDValue();
18207 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
18208 return SDValue();
18209 Negate = !Negate;
18210 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
18211 }
18212 case ISD::SETCC: {
// Only comparisons against the constants 0 or 1 are looked through.
18213 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
18214 if (!Const)
18215 return SDValue();
18216 if (Const->isZero())
18217 Imm = 0;
18218 else if (Const->isOne())
18219 Imm = 1;
18220 else
18221 return SDValue();
18222 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18223 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18224 }
// NOTE(review): the case label that opens this final block is missing from
// the listing; operand 1 is read as an intrinsic ID, so it is presumably a
// chained-intrinsic opcode - confirm against upstream.
18226 unsigned IntOp = N.getConstantOperandVal(1);
18227 if (IntOp != Intrinsic::test_start_loop_iterations &&
18228 IntOp != Intrinsic::loop_decrement_reg)
18229 return SDValue();
18230 return N;
18231 }
18232 }
18233 return SDValue();
18234}
18235
18238 const ARMSubtarget *ST) {
// Rewrites a BRCOND / BR_CC whose condition is derived (via
// SearchLoopIntrinsic) from test_start_loop_iterations or loop_decrement_reg:
//  - test_start_loop_iterations -> ARMISD::WLSSETUP + ARMISD::WLS
//  - loop_decrement_reg         -> ARMISD::LOOP_DEC + ARMISD::LE
// Returns an empty SDValue when no such intrinsic is found.
// NOTE(review): the lines with the function name and leading parameters are
// absent from this listing; N and DCI are used below - confirm upstream.
18239
18240 // The hwloop intrinsics that we're interested are used for control-flow,
18241 // either for entering or exiting the loop:
18242 // - test.start.loop.iterations will test whether its operand is zero. If it
18243 // is zero, the proceeding branch should not enter the loop.
18244 // - loop.decrement.reg also tests whether its operand is zero. If it is
18245 // zero, the proceeding branch should not branch back to the beginning of
18246 // the loop.
18247 // So here, we need to check how the brcond is using the result of each
18248 // of the intrinsics to ensure that we're branching to the right place at the
18249 // right time.
18250
// NOTE(review): the declaration of CC is missing from this listing; it is
// assigned ISD::SETEQ and a CondCodeSDNode value below, so it is presumably
// an ISD::CondCode - confirm against upstream.
18252 SDValue Cond;
18253 int Imm = 1;
18254 bool Negate = false;
18255 SDValue Chain = N->getOperand(0);
18256 SDValue Dest;
18257
// Normalize both branch forms to (CC, Cond, Imm, Dest). A plain BRCOND is
// treated as "Cond == 1" (CC = SETEQ with the default Imm of 1).
18258 if (N->getOpcode() == ISD::BRCOND) {
18259 CC = ISD::SETEQ;
18260 Cond = N->getOperand(1);
18261 Dest = N->getOperand(2);
18262 } else {
18263 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18264 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18265 Cond = N->getOperand(2);
18266 Dest = N->getOperand(4);
// Only comparisons against the constants 0 or 1 are handled.
18267 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18268 if (!Const->isOne() && !Const->isZero())
18269 return SDValue();
18270 Imm = Const->getZExtValue();
18271 } else
18272 return SDValue();
18273 }
18274
18275 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18276 if (!Int)
18277 return SDValue();
18278
18279 if (Negate)
18280 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18281
// (CC, Imm) combinations that are accepted as "branch taken when the
// intrinsic result is zero".
18282 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18283 return (CC == ISD::SETEQ && Imm == 0) ||
18284 (CC == ISD::SETNE && Imm == 1) ||
18285 (CC == ISD::SETLT && Imm == 1) ||
18286 (CC == ISD::SETULT && Imm == 1);
18287 };
18288
// (CC, Imm) combinations that are accepted as "branch not taken when the
// intrinsic result is zero".
18289 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18290 return (CC == ISD::SETEQ && Imm == 1) ||
18291 (CC == ISD::SETNE && Imm == 0) ||
18292 (CC == ISD::SETGT && Imm == 0) ||
18293 (CC == ISD::SETUGT && Imm == 0) ||
18294 (CC == ISD::SETGE && Imm == 1) ||
18295 (CC == ISD::SETUGE && Imm == 1);
18296 };
18297
18298 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18299 "unsupported condition");
18300
18301 SDLoc dl(Int);
18302 SelectionDAG &DAG = DCI.DAG;
18303 SDValue Elements = Int.getOperand(2);
18304 unsigned IntOp = Int->getConstantOperandVal(1);
// The conditional branch is expected to feed exactly one unconditional BR,
// whose target is the "other" successor.
18305 assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
18306 && "expected single br user");
18307 SDNode *Br = *N->use_begin();
18308 SDValue OtherTarget = Br->getOperand(1);
18309
18310 // Update the unconditional branch to branch to the given Dest.
18311 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18312 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18313 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18314 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18315 };
18316
18317 if (IntOp == Intrinsic::test_start_loop_iterations) {
18318 SDValue Res;
18319 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18320 // We expect this 'instruction' to branch when the counter is zero.
18321 if (IsTrueIfZero(CC, Imm)) {
18322 SDValue Ops[] = {Chain, Setup, Dest};
18323 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18324 } else {
18325 // The logic is the reverse of what we need for WLS, so find the other
18326 // basic block target: the target of the proceeding br.
18327 UpdateUncondBr(Br, Dest, DAG);
18328
18329 SDValue Ops[] = {Chain, Setup, OtherTarget};
18330 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18331 }
18332 // Update LR count to the new value
18333 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18334 // Update chain
18335 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18336 return Res;
18337 } else {
// loop_decrement_reg: operand 3 of the intrinsic is re-emitted as a target
// constant and passed to LOOP_DEC together with the chain and the count.
18338 SDValue Size =
18339 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18340 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18341 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18342 DAG.getVTList(MVT::i32, MVT::Other), Args);
18343 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18344
18345 // We expect this instruction to branch when the count is not zero.
18346 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18347
18348 // Update the unconditional branch to target the loop preheader if we've
18349 // found the condition has been reversed.
18350 if (Target == OtherTarget)
18351 UpdateUncondBr(Br, Dest, DAG);
18352
// Join LOOP_DEC's output chain with the incoming branch chain before LE.
18353 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18354 SDValue(LoopDec.getNode(), 1), Chain);
18355
18356 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18357 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18358 }
18359 return SDValue();
18360}
18361
18362/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
///
/// Folds a branch on "((cmov 0, 1, CC, CPSR, Cmp) & 1) != 0" into a branch
/// directly on the inner CMOV's condition, flags register and compare.
/// Returns an empty SDValue when the pattern does not match.
/// NOTE(review): the line with the function name and parameters is absent
/// from this listing; N and DAG are used below - confirm upstream.
18363SDValue
18365 SDValue Cmp = N->getOperand(4);
18366 if (Cmp.getOpcode() != ARMISD::CMPZ)
18367 // Only looking at NE cases.
18368 return SDValue();
18369
// Operands read from the branch node: 0 chain, 1 destination block,
// 2 condition code, 4 compare.
18370 EVT VT = N->getValueType(0);
18371 SDLoc dl(N);
18372 SDValue LHS = Cmp.getOperand(0);
18373 SDValue RHS = Cmp.getOperand(1);
18374 SDValue Chain = N->getOperand(0);
18375 SDValue BB = N->getOperand(1);
18376 SDValue ARMcc = N->getOperand(2);
// NOTE(review): the declaration of CC is missing from this listing; it is
// compared with ARMCC::NE below, so it is presumably the condition code
// decoded from ARMcc - confirm against upstream.
18378
18379 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
18380 // -> (brcond Chain BB CC CPSR Cmp)
// Both the AND and the inner CMOV must have a single use.
18381 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18382 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18383 LHS->getOperand(0)->hasOneUse() &&
18384 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18385 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18386 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18387 return DAG.getNode(
18388 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
18389 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
18390 }
18391
18392 return SDValue();
18393}
18394
18395/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
///
/// Only CMOVs whose compare (operand 4) is an ARMISD::CMPZ are considered.
/// Operands read from N: 0 false value, 1 true value, 2 condition code,
/// 3 passed through unchanged, 4 compare. The function tries, in order: a
/// BFI-based rewrite (V6T2+, not Thumb1-only), removal of a redundant move
/// of the compared value, folding through an inner "cmov 0, 1", folding
/// through IsCMPZCSINC, and branch-free materialization of boolean results.
/// Returns the replacement value, or an empty SDValue for no change.
/// NOTE(review): several lines of this function (the name/parameter line and
/// a few declarations) are absent from this listing; notes below mark each.
18396SDValue
18398 SDValue Cmp = N->getOperand(4);
18399 if (Cmp.getOpcode() != ARMISD::CMPZ)
18400 // Only looking at EQ and NE cases.
18401 return SDValue();
18402
18403 EVT VT = N->getValueType(0);
18404 SDLoc dl(N);
18405 SDValue LHS = Cmp.getOperand(0);
18406 SDValue RHS = Cmp.getOperand(1);
18407 SDValue FalseVal = N->getOperand(0);
18408 SDValue TrueVal = N->getOperand(1);
18409 SDValue ARMcc = N->getOperand(2);
// NOTE(review): the declaration of CC is missing from this listing; it is
// compared with ARMCC::NE / ARMCC::EQ below, so it is presumably the
// condition code decoded from ARMcc - confirm against upstream.
18411
18412 // BFI is only available on V6T2+.
18413 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
// NOTE(review): the line that computes R is missing from this listing;
// given the comment above it is presumably the CMOV-to-BFI combine - confirm.
18415 if (R)
18416 return R;
18417 }
18418
18419 // Simplify
18420 // mov r1, r0
18421 // cmp r1, x
18422 // mov r0, y
18423 // moveq r0, x
18424 // to
18425 // cmp r0, x
18426 // movne r0, y
18427 //
18428 // mov r1, r0
18429 // cmp r1, x
18430 // mov r0, x
18431 // movne r0, y
18432 // to
18433 // cmp r0, x
18434 // movne r0, y
18435 /// FIXME: Turn this into a target neutral optimization?
18436 SDValue Res;
// When the value selected on equality is the compared constant itself, LHS
// can stand in for it. The EQ form is flipped to NE with a fresh compare.
18437 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18438 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
18439 N->getOperand(3), Cmp);
18440 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
18441 SDValue ARMcc;
18442 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18443 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
18444 N->getOperand(3), NewCmp);
18445 }
18446
18447 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
18448 // -> (cmov F T CC CPSR Cmp)
18449 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18450 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18451 isNullConstant(RHS)) {
18452 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18453 LHS->getOperand(2), LHS->getOperand(3),
18454 LHS->getOperand(4));
18455 }
18456
// Everything below applies to integer results only. Note that a Res computed
// above is dropped here for non-integer VT.
18457 if (!VT.isInteger())
18458 return SDValue();
18459
18460 // Fold away an unnecessary CMPZ/CMOV
18461 // CMOV A, B, C1, $cpsr, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18462 // if C1==EQ -> CMOV A, B, C2, $cpsr, D
18463 // if C1==NE -> CMOV A, B, NOT(C2), $cpsr, D
18464 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18465 N->getConstantOperandVal(2) == ARMCC::NE) {
// NOTE(review): the declaration of Cond (filled in by IsCMPZCSINC) and the
// statement guarded by the NE check below are missing from this listing;
// per the comment above the latter presumably inverts Cond - confirm.
18467 if (SDValue C = IsCMPZCSINC(N->getOperand(4).getNode(), Cond)) {
18468 if (N->getConstantOperandVal(2) == ARMCC::NE)
18470 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18471 N->getOperand(1),
18472 DAG.getTargetConstant(Cond, SDLoc(N), MVT::i32),
18473 N->getOperand(3), C);
18474 }
18475 }
18476
18477 // Materialize a boolean comparison for integers so we can avoid branching.
18478 if (isNullConstant(FalseVal)) {
18479 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18480 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18481 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
18482 // right 5 bits will make that 32 be 1, otherwise it will be 0.
18483 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
18484 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18485 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18486 DAG.getConstant(5, dl, MVT::i32));
18487 } else {
18488 // CMOV 0, 1, ==, (CMPZ x, y) ->
18489 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18490 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18491 //
18492 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18493 // x != y. In other words, a carry C == 1 when x == y, C == 0
18494 // otherwise.
18495 // The final UADDO_CARRY computes
18496 // x - y + (0 - (x - y)) + C == C
// NOTE(review): the node actually built for t below is ISD::USUBO (two
// operands, no carry-in), not USUBO_CARRY as the comment text says.
18497 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18498 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18499 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18500 // ISD::USUBO_CARRY returns a borrow but we want the carry here
18501 // actually.
18502 SDValue Carry =
18503 DAG.getNode(ISD::SUB, dl, MVT::i32,
18504 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18505 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18506 }
18507 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18508 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18509 // This seems pointless but will allow us to combine it further below.
18510 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18511 SDValue Sub =
18512 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18513 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
18514 Sub.getValue(1), SDValue());
18515 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18516 N->getOperand(3), CPSRGlue.getValue(1));
// FalseVal now names the SUBC so the Thumb1 pattern below can match it.
18517 FalseVal = Sub;
18518 }
18519 } else if (isNullConstant(TrueVal)) {
18520 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18521 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18522 // This seems pointless but will allow us to combine it further below
18523 // Note that we change == for != as this is the dual for the case above.
18524 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18525 SDValue Sub =
18526 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18527 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
18528 Sub.getValue(1), SDValue());
18529 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18530 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18531 N->getOperand(3), CPSRGlue.getValue(1));
18532 FalseVal = Sub;
18533 }
18534 }
18535
18536 // On Thumb1, the DAG above may be further combined if z is a power of 2
18537 // (z == 2 ^ K).
18538 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18539 // t1 = (USUBO (SUB x, y), 1)
18540 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18541 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18542 //
18543 // This also handles the special case of comparing against zero; it's
18544 // essentially, the same pattern, except there's no SUBC:
18545 // CMOV x, z, !=, (CMPZ x, 0) ->
18546 // t1 = (USUBO x, 1)
18547 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18548 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18549 const APInt *TrueConst;
18550 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18551 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18552 FalseVal.getOperand(1) == RHS) ||
18553 (FalseVal == LHS && isNullConstant(RHS))) &&
18554 (TrueConst = isPowerOf2Constant(TrueVal))) {
18555 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18556 unsigned ShiftAmount = TrueConst->logBase2();
// For z != 1, compute the 0/1 result first and shift it into place after.
18557 if (ShiftAmount)
18558 TrueVal = DAG.getConstant(1, dl, VT);
18559 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18560 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18561 Subc.getValue(1));
18562
18563 if (ShiftAmount)
18564 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18565 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18566 }
18567
18568 if (Res.getNode()) {
18569 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18570 // Capture demanded bits information that would be otherwise lost.
// Exactly the upper 31 / 24 / 16 bits known zero maps to an AssertZext from
// i1 / i8 / i16 respectively; any other known-zero mask adds nothing.
18571 if (Known.Zero == 0xfffffffe)
18572 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18573 DAG.getValueType(MVT::i1));
18574 else if (Known.Zero == 0xffffff00)
18575 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18576 DAG.getValueType(MVT::i8));
18577 else if (Known.Zero == 0xffff0000)
18578 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18579 DAG.getValueType(MVT::i16));
18580 }
18581
18582 return Res;
18583}
18584
18587 const ARMSubtarget *ST) {
// Combines applied to a bitcast-like node N with source N->getOperand(0):
//  1. With MVE integer ops, a cast of an ARMISD::VDUP with the same scalar
//     width becomes a VDUP directly in the destination type.
//  2. On big-endian targets, a cast of VMOVIMM / VMVNIMM / VMOVFPIMM to an
//     equal-or-wider element type becomes ARMISD::VECTOR_REG_CAST.
//  3. Otherwise PerformExtractEltToVMOVRRD is tried.
// Returns an empty SDValue when nothing applies.
// NOTE(review): the lines with the function name and leading parameters are
// absent from this listing; N and DCI are used below - confirm upstream.
18588 SelectionDAG &DAG = DCI.DAG;
18589 SDValue Src = N->getOperand(0);
18590 EVT DstVT = N->getValueType(0);
18591
18592 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18593 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18594 EVT SrcVT = Src.getValueType();
18595 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18596 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18597 }
18598
18599 // We may have a bitcast of something that has already had this bitcast
18600 // combine performed on it, so skip past any VECTOR_REG_CASTs.
// Only skipped when the cast's input elements are no wider than its output
// elements.
18601 if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
18602 Src.getOperand(0).getValueType().getScalarSizeInBits() <=
18603 Src.getValueType().getScalarSizeInBits())
18604 Src = Src.getOperand(0);
18605
18606 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18607 // would be generated is at least the width of the element type.
18608 EVT SrcVT = Src.getValueType();
18609 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18610 Src.getOpcode() == ARMISD::VMVNIMM ||
18611 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18612 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18613 DAG.getDataLayout().isBigEndian())
18614 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18615
18616 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18617 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18618 return R;
18619
18620 return SDValue();
18621}
18622
18623// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
18624// node into stack operations after legalizeOps.
//
// In order: all-undef operands fold to undef; two nested MVETRUNC operands
// are flattened into one four-operand MVETRUNC; two shuffles of the same
// inputs whose combined mask is a VMOVN truncation mask become ARMISD::VMOVN;
// build-vector / shuffle operands are expanded to a scalar BUILD_VECTOR; and
// finally, after DAG legalization only, the node is lowered to truncating
// stores into a 16-byte stack slot followed by one full-width load.
// NOTE(review): the signature lines are absent from this listing; N and DCI
// are used below - confirm upstream.
18627 SelectionDAG &DAG = DCI.DAG;
18628 EVT VT = N->getValueType(0);
18629 SDLoc DL(N);
18630
18631 // MVETrunc(Undef, Undef) -> Undef
18632 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18633 return DAG.getUNDEF(VT);
18634
18635 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
18636 if (N->getNumOperands() == 2 &&
18637 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18638 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18639 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18640 N->getOperand(0).getOperand(1),
18641 N->getOperand(1).getOperand(0),
18642 N->getOperand(1).getOperand(1));
18643
18644 // MVETrunc(shuffle, shuffle) -> VMOVN
18645 if (N->getNumOperands() == 2 &&
18646 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18647 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18648 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18649 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18650
// Both shuffles must read the same two source vectors.
18651 if (S0->getOperand(0) == S1->getOperand(0) &&
18652 S0->getOperand(1) == S1->getOperand(1)) {
18653 // Construct complete shuffle mask
18654 SmallVector<int, 8> Mask(S0->getMask());
18655 Mask.append(S1->getMask().begin(), S1->getMask().end());
18656
// The two checks differ only in the final flag passed to isVMOVNTruncMask;
// the second one emits the VMOVN with its source operands swapped.
18657 if (isVMOVNTruncMask(Mask, VT, false))
18658 return DAG.getNode(
18659 ARMISD::VMOVN, DL, VT,
18660 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18661 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18662 DAG.getConstant(1, DL, MVT::i32));
18663 if (isVMOVNTruncMask(Mask, VT, true))
18664 return DAG.getNode(
18665 ARMISD::VMOVN, DL, VT,
18666 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18667 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18668 DAG.getConstant(1, DL, MVT::i32));
18669 }
18670 }
18671
18672 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18673 // truncate to a buildvector to allow the generic optimisations to kick in.
18674 if (all_of(N->ops(), [](SDValue Op) {
18675 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18676 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18677 (Op.getOpcode() == ISD::BITCAST &&
18678 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18679 })) {
// Extract every lane of every operand, in operand order, as an i32 scalar
// and rebuild the result vector from those scalars.
18680 SmallVector<SDValue, 8> Extracts;
18681 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18682 SDValue O = N->getOperand(Op);
18683 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18684 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18685 DAG.getConstant(i, DL, MVT::i32));
18686 Extracts.push_back(Ext);
18687 }
18688 }
18689 return DAG.getBuildVector(VT, DL, Extracts);
18690 }
18691
18692 // If we are late in the legalization process and nothing has optimised
18693 // the trunc to anything better, lower it to a stack store and reload,
18694 // performing the truncation whilst keeping the lanes in the correct order:
18695 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
18696 if (!DCI.isAfterLegalizeDAG())
18697 return SDValue();
18698
18699 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18700 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18701 int NumIns = N->getNumOperands();
18702 assert((NumIns == 2 || NumIns == 4) &&
18703 "Expected 2 or 4 inputs to an MVETrunc");
// Each operand is stored as 1/NumIns of the result: halve the element count
// once for two inputs, twice for four.
18704 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18705 if (N->getNumOperands() == 4)
18706 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18707
18708 SmallVector<SDValue> Chains;
18709 for (int I = 0; I < NumIns; I++) {
// Operand I is written at byte offset I * 16 / NumIns within the slot.
18710 SDValue Ptr = DAG.getNode(
18711 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18712 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
// NOTE(review): the line that begins the declaration of MPI is missing from
// this listing; the visible arguments are the machine function, the frame
// index and the byte offset - confirm against upstream.
18714 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18715 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18716 Ptr, MPI, StoreVT, Align(4));
18717 Chains.push_back(Ch);
18718 }
18719
// The load is chained on all the stores through a single TokenFactor.
18720 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
// NOTE(review): the initializer of this MPI is missing from the listing.
18721 MachinePointerInfo MPI =
18723 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18724}
18725
18726// Take a MVEEXT(load x) and split that into (extload x, extload x+8)
//
// Operand 0 of N must be a simple, non-indexed load with a single use whose
// own extension kind (if any) is compatible with N. The load is replaced by
// several extending loads of NumElements lanes each, at consecutive byte
// offsets from the original base pointer; their chains are merged with a
// TokenFactor that replaces the old load's chain, and the new loads are
// returned through getMergeValues.
// NOTE(review): the line with the function name and leading parameters is
// absent from this listing; N and DAG are used below - confirm upstream.
18728 SelectionDAG &DAG) {
18729 SDValue N0 = N->getOperand(0);
// NOTE(review): the declaration of LD is missing from this listing; from its
// use below it is presumably N0 viewed as a load node (null if it is not
// one) - confirm against upstream.
18731 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18732 return SDValue();
18733
18734 EVT FromVT = LD->getMemoryVT();
18735 EVT ToVT = N->getValueType(0);
18736 if (!ToVT.isVector())
18737 return SDValue();
// Each result value carries half as many lanes as the loaded vector.
18738 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18739 EVT ToEltVT = ToVT.getVectorElementType();
18740 EVT FromEltVT = FromVT.getVectorElementType();
18741
// Lanes per new load: 4 when extending to i32 (from i16 or i8), 8 when
// extending i8 to i16.
18742 unsigned NumElements = 0;
18743 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18744 NumElements = 4;
18745 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18746 NumElements = 8;
18747 assert(NumElements != 0);
18748
// The original load must be non-extending, any-extending, or already
// extending the same way as N.
18749 ISD::LoadExtType NewExtType =
18750 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18751 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18752 LD->getExtensionType() != ISD::EXTLOAD &&
18753 LD->getExtensionType() != NewExtType)
18754 return SDValue();
18755
18756 LLVMContext &C = *DAG.getContext();
18757 SDLoc DL(LD);
18758 // Details about the old load
18759 SDValue Ch = LD->getChain();
18760 SDValue BasePtr = LD->getBasePtr();
18761 Align Alignment = LD->getOriginalAlign();
18762 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18763 AAMDNodes AAInfo = LD->getAAInfo();
18764
// NewFromVT is the in-memory type of each new load and NewToVT its extended
// result type; both are integer vectors of NumElements lanes.
18765 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18766 EVT NewFromVT = EVT::getVectorVT(
18767 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18768 EVT NewToVT = EVT::getVectorVT(
18769 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18770
// NOTE(review): the declarations of Loads and Chains are missing from this
// listing; both are appended to in the loop below - confirm upstream.
18773 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
// Byte offset of the i-th piece within the original load.
18774 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18775 SDValue NewPtr =
18776 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18777
18778 SDValue NewLoad =
18779 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18780 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18781 Alignment, MMOFlags, AAInfo);
18782 Loads.push_back(NewLoad);
18783 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18784 }
18785
// Redirect users of the old load's chain to the merged chain of new loads.
18786 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18787 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18788 return DAG.getMergeValues(Loads, DL);
18789}
18790
18791// Perform combines for MVEEXT. If it has not been optimized to anything better
18792// before lowering, it gets converted to stack store and extloads performing the
18793// extend whilst still keeping the same lane ordering.
// NOTE(review): the signature and a few statement lines (for example the
// declaration of `Loads` and the first line of some MachinePointerInfo
// initialisers) are not visible in this listing; confirm against the full file.
18796 SelectionDAG &DAG = DCI.DAG;
18797 EVT VT = N->getValueType(0);
18798 SDLoc DL(N);
18799 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18800 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18801
18802 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18803 *DAG.getContext());
  // Reinterpret V as the result type, then sign- or zero-extend in register
  // from ExtVT depending on the MVEEXT opcode.
18804 auto Extend = [&](SDValue V) {
18805 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18806 return N->getOpcode() == ARMISD::MVESEXT
18807 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18808 DAG.getValueType(ExtVT))
18809 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18810 };
18811
18812 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18813 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18814 SDValue Ext = Extend(N->getOperand(0));
18815 return DAG.getMergeValues({Ext, Ext}, DL);
18816 }
18817
18818 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18819 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18820 ArrayRef<int> Mask = SVN->getMask();
18821 assert(Mask.size() == 2 * VT.getVectorNumElements());
18822 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18823 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18824 SDValue Op0 = SVN->getOperand(0);
18825 SDValue Op1 = SVN->getOperand(1);
18826
  // True when every defined (non-negative) mask entry in the half beginning
  // at Start equals Idx * 2 + Offset, i.e. selects every other source lane.
18827 auto CheckInregMask = [&](int Start, int Offset) {
18828 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18829 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18830 return false;
18831 return true;
18832 };
  // V0/V1 start out as N's own results; each is replaced only if its half of
  // the mask matches one of the patterns below.
18833 SDValue V0 = SDValue(N, 0);
18834 SDValue V1 = SDValue(N, 1);
18835 if (CheckInregMask(0, 0))
18836 V0 = Extend(Op0);
18837 else if (CheckInregMask(0, 1))
18838 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18839 else if (CheckInregMask(0, Mask.size()))
18840 V0 = Extend(Op1);
18841 else if (CheckInregMask(0, Mask.size() + 1))
18842 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18843
18844 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18845 V1 = Extend(Op1);
18846 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18847 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18848 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18849 V1 = Extend(Op0);
18850 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18851 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18852
  // Only return a replacement if at least one half was rewritten.
18853 if (V0.getNode() != N || V1.getNode() != N)
18854 return DAG.getMergeValues({V0, V1}, DL);
18855 }
18856
18857 // MVEEXT(load) -> extload, extload
18858 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
18860 return L;
18861
  // The stack fallback below is only used once the DAG has been legalized.
18862 if (!DCI.isAfterLegalizeDAG())
18863 return SDValue();
18864
18865 // Lower to a stack store and reload:
18866 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
18867 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18868 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18869 int NumOuts = N->getNumValues();
18870 assert((NumOuts == 2 || NumOuts == 4) &&
18871 "Expected 2 or 4 outputs to an MVEEXT");
18872 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18873 *DAG.getContext());
18874 if (N->getNumOperands() == 4)
18875 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
18876
18877 MachinePointerInfo MPI =
18879 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
18880 StackPtr, MPI, Align(4));
18881
  // One extending load per result, at byte offset I * 16 / NumOuts within the
  // 16-byte stack temporary.
18883 for (int I = 0; I < NumOuts; I++) {
18884 SDValue Ptr = DAG.getNode(
18885 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18886 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
18888 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
18889 SDValue Load = DAG.getExtLoad(
18890 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
18891 VT, Chain, Ptr, MPI, LoadVT, Align(4));
18892 Loads.push_back(Load);
18893 }
18894
18895 return DAG.getMergeValues(Loads, DL);
18896}
18897
18899 DAGCombinerInfo &DCI) const {
  // Dispatch on the node's opcode to the matching combine helper. Opcodes
  // without a case, and cases that `break`, end at the final
  // `return SDValue()`.
  // NOTE(review): the first signature line and several `case` labels are not
  // visible in this listing (some `return` lines below appear without their
  // label); confirm against the full file.
18900 switch (N->getOpcode()) {
18901 default: break;
18902 case ISD::SELECT_CC:
18903 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
18904 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
18905 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18906 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
18907 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
18908 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
18909 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
18910 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
18911 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
18912 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
18913 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
18914 case ISD::BRCOND:
18915 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
18916 case ARMISD::ADDC:
18917 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
18918 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
18919 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
18920 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18921 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
18922 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
18923 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
18924 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
18925 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
18928 return PerformExtractEltCombine(N, DCI, Subtarget);
18932 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18933 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
18934 case ISD::FP_TO_SINT:
18935 case ISD::FP_TO_UINT:
18936 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
18937 case ISD::FADD:
18938 return PerformFADDCombine(N, DCI.DAG, Subtarget);
18939 case ISD::FMUL:
18940 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
18942 return PerformIntrinsicCombine(N, DCI);
18943 case ISD::SHL:
18944 case ISD::SRA:
18945 case ISD::SRL:
18946 return PerformShiftCombine(N, DCI, Subtarget);
18947 case ISD::SIGN_EXTEND:
18948 case ISD::ZERO_EXTEND:
18949 case ISD::ANY_EXTEND:
18950 return PerformExtendCombine(N, DCI.DAG, Subtarget);
18951 case ISD::FP_EXTEND:
18952 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
18953 case ISD::SMIN:
18954 case ISD::UMIN:
18955 case ISD::SMAX:
18956 case ISD::UMAX:
18957 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
18958 case ARMISD::CMOV:
18959 return PerformCMOVCombine(N, DCI.DAG);
18960 case ARMISD::BRCOND:
18961 return PerformBRCONDCombine(N, DCI.DAG);
18962 case ARMISD::CMPZ:
18963 return PerformCMPZCombine(N, DCI.DAG);
18964 case ARMISD::CSINC:
18965 case ARMISD::CSINV:
18966 case ARMISD::CSNEG:
18967 return PerformCSETCombine(N, DCI.DAG);
18968 case ISD::LOAD:
18969 return PerformLOADCombine(N, DCI, Subtarget);
18970 case ARMISD::VLD1DUP:
18971 case ARMISD::VLD2DUP:
18972 case ARMISD::VLD3DUP:
18973 case ARMISD::VLD4DUP:
18974 return PerformVLDCombine(N, DCI);
18976 return PerformARMBUILD_VECTORCombine(N, DCI);
18977 case ISD::BITCAST:
18978 return PerformBITCASTCombine(N, DCI, Subtarget);
18980 return PerformPREDICATE_CASTCombine(N, DCI);
18982 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
18983 case ARMISD::MVETRUNC:
18984 return PerformMVETruncCombine(N, DCI);
18985 case ARMISD::MVESEXT:
18986 case ARMISD::MVEZEXT:
18987 return PerformMVEExtCombine(N, DCI);
18988 case ARMISD::VCMP:
18989 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
18990 case ISD::VECREDUCE_ADD:
18991 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
18992 case ARMISD::VADDVs:
18993 case ARMISD::VADDVu:
18994 case ARMISD::VADDLVs:
18995 case ARMISD::VADDLVu:
18996 case ARMISD::VADDLVAs:
18997 case ARMISD::VADDLVAu:
18998 case ARMISD::VMLAVs:
18999 case ARMISD::VMLAVu:
19000 case ARMISD::VMLALVs:
19001 case ARMISD::VMLALVu:
19002 case ARMISD::VMLALVAs:
19003 case ARMISD::VMLALVAu:
19004 return PerformReduceShuffleCombine(N, DCI.DAG);
19005 case ARMISD::VMOVN:
19006 return PerformVMOVNCombine(N, DCI);
19007 case ARMISD::VQMOVNs:
19008 case ARMISD::VQMOVNu:
19009 return PerformVQMOVNCombine(N, DCI);
19010 case ARMISD::VQDMULH:
19011 return PerformVQDMULHCombine(N, DCI);
19012 case ARMISD::ASRL:
19013 case ARMISD::LSRL:
19014 case ARMISD::LSLL:
19015 return PerformLongShiftCombine(N, DCI.DAG);
  // The cases below only restrict which operand bits are demanded. When
  // SimplifyDemandedBits reports a change they return an empty SDValue.
19016 case ARMISD::SMULWB: {
  // Operand 1: low 16 bits demanded.
19017 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19018 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19019 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19020 return SDValue();
19021 break;
19022 }
19023 case ARMISD::SMULWT: {
  // Operand 1: high 16 bits demanded.
19024 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19025 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19026 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19027 return SDValue();
19028 break;
19029 }
19030 case ARMISD::SMLALBB:
19031 case ARMISD::QADD16b:
19032 case ARMISD::QSUB16b:
19033 case ARMISD::UQADD16b:
19034 case ARMISD::UQSUB16b: {
  // Operands 0 and 1: low 16 bits demanded.
19035 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19036 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19037 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19038 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19039 return SDValue();
19040 break;
19041 }
19042 case ARMISD::SMLALBT: {
  // Operand 0: low 16 bits demanded; operand 1: high 16 bits demanded.
19043 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
19044 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19045 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
19046 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19047 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
19048 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
19049 return SDValue();
19050 break;
19051 }
19052 case ARMISD::SMLALTB: {
  // Operand 0: high 16 bits demanded; operand 1: low 16 bits demanded.
19053 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
19054 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19055 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
19056 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19057 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
19058 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
19059 return SDValue();
19060 break;
19061 }
19062 case ARMISD::SMLALTT: {
  // Operands 0 and 1: high 16 bits demanded.
19063 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19064 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19065 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19066 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19067 return SDValue();
19068 break;
19069 }
19070 case ARMISD::QADD8b:
19071 case ARMISD::QSUB8b:
19072 case ARMISD::UQADD8b:
19073 case ARMISD::UQSUB8b: {
  // Operands 0 and 1: low 8 bits demanded.
19074 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19075 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
19076 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19077 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19078 return SDValue();
19079 break;
19080 }
19081 case ARMISD::VBSP:
  // When operands 1 and 2 are the same value, that value is the result.
19082 if (N->getOperand(1) == N->getOperand(2))
19083 return N->getOperand(1);
19084 return SDValue();
  // NOTE(review): the case label(s) introducing this inner switch are not
  // visible here. Operand 1 is read as a constant and compared against
  // intrinsic IDs.
19087 switch (N->getConstantOperandVal(1)) {
19088 case Intrinsic::arm_neon_vld1:
19089 case Intrinsic::arm_neon_vld1x2:
19090 case Intrinsic::arm_neon_vld1x3:
19091 case Intrinsic::arm_neon_vld1x4:
19092 case Intrinsic::arm_neon_vld2:
19093 case Intrinsic::arm_neon_vld3:
19094 case Intrinsic::arm_neon_vld4:
19095 case Intrinsic::arm_neon_vld2lane:
19096 case Intrinsic::arm_neon_vld3lane:
19097 case Intrinsic::arm_neon_vld4lane:
19098 case Intrinsic::arm_neon_vld2dup:
19099 case Intrinsic::arm_neon_vld3dup:
19100 case Intrinsic::arm_neon_vld4dup:
19101 case Intrinsic::arm_neon_vst1:
19102 case Intrinsic::arm_neon_vst1x2:
19103 case Intrinsic::arm_neon_vst1x3:
19104 case Intrinsic::arm_neon_vst1x4:
19105 case Intrinsic::arm_neon_vst2:
19106 case Intrinsic::arm_neon_vst3:
19107 case Intrinsic::arm_neon_vst4:
19108 case Intrinsic::arm_neon_vst2lane:
19109 case Intrinsic::arm_neon_vst3lane:
19110 case Intrinsic::arm_neon_vst4lane:
19111 return PerformVLDCombine(N, DCI);
19112 case Intrinsic::arm_mve_vld2q:
19113 case Intrinsic::arm_mve_vld4q:
19114 case Intrinsic::arm_mve_vst2q:
19115 case Intrinsic::arm_mve_vst4q:
19116 return PerformMVEVLDCombine(N, DCI);
19117 default: break;
19118 }
19119 break;
19120 }
19121 return SDValue();
19122}
19123
19125 EVT VT) const {
  // True only for an f32 load or store.
  // NOTE(review): the first signature line (which declares `Opc`) is not
  // visible in this listing.
19126 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
19127}
19128
19130 Align Alignment,
19132 unsigned *Fast) const {
  // Returns whether a misaligned access of type VT is permitted. When it is
  // and Fast is non-null, *Fast is written as well.
  // NOTE(review): the first signature line and one parameter line are not
  // visible in this listing.
19133 // Depends what it gets converted into if the type is weird.
19134 if (!VT.isSimple())
19135 return false;
19136
19137 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
19138 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
19139 auto Ty = VT.getSimpleVT().SimpleTy;
19140
19141 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
19142 // Unaligned access can use (for example) LDRB, LDRH, LDR
19143 if (AllowsUnaligned) {
  // Reported as fast only when the subtarget has V7 ops.
19144 if (Fast)
19145 *Fast = Subtarget->hasV7Ops();
19146 return true;
19147 }
19148 }
19149
19150 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
19151 // For any little-endian targets with neon, we can support unaligned ld/st
19152 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
19153 // A big-endian target may also explicitly support unaligned accesses
19154 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
19155 if (Fast)
19156 *Fast = 1;
19157 return true;
19158 }
19159 }
19160
  // Everything below applies only with MVE integer ops.
19161 if (!Subtarget->hasMVEIntegerOps())
19162 return false;
19163
19164 // These are for predicates
19165 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
19166 Ty == MVT::v2i1)) {
19167 if (Fast)
19168 *Fast = 1;
19169 return true;
19170 }
19171
19172 // These are for truncated stores/narrowing loads. They are fine so long as
19173 // the alignment is at least the size of the item being loaded
19174 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19175 Alignment >= VT.getScalarSizeInBits() / 8) {
19176 if (Fast)
19177 *Fast = true;
19178 return true;
19179 }
19180
19181 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19182 // VSTRW.U32 all store the vector register in exactly the same format, and
19183 // differ only in the range of their immediate offset field and the required
19184 // alignment. So there is always a store that can be used, regardless of
19185 // actual type.
19186 //
19187 // For big endian, that is not the case. But can still emit a (VSTRB.U8;
19188 // VREV64.8) pair and get the same effect. This will likely be better than
19189 // aligning the vector through the stack.
19190 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19191 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19192 Ty == MVT::v2f64) {
19193 if (Fast)
19194 *Fast = 1;
19195 return true;
19196 }
19197
19198 return false;
19199}
19200
19201
19203 const MemOp &Op, const AttributeList &FuncAttributes) const {
  // Chooses the value type used for a memcpy or zero-memset: v2f64 or f64 when
  // the NEON conditions below hold, otherwise MVT::Other.
  // NOTE(review): the first signature line and parts of the two
  // allowsMisalignedMemoryAccesses calls are not visible in this listing.
19204 // See if we can use NEON instructions for this...
19205 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19206 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19207 unsigned Fast;
  // At least 16 bytes: use v2f64 if 16-byte aligned, or if the misaligned
  // access is allowed and reported as fast.
19208 if (Op.size() >= 16 &&
19209 (Op.isAligned(Align(16)) ||
19210 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19212 Fast))) {
19213 return MVT::v2f64;
  // At least 8 bytes: the same test with f64 and 8-byte alignment.
19214 } else if (Op.size() >= 8 &&
19215 (Op.isAligned(Align(8)) ||
19217 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19218 Fast))) {
19219 return MVT::f64;
19220 }
19221 }
19222
19223 // Let the target-independent logic figure it out.
19224 return MVT::Other;
19225}
19226
19227// 64-bit integers are split into their high and low parts and held in two
19228// different registers, so the trunc is free since the low register can just
19229// be used.
19230bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19231 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19232 return false;
19233 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19234 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19235 return (SrcBits == 64 && DestBits == 32);
19236}
19237
// NOTE(review): the signature line (which declares `SrcVT` and `DstVT`) is not
// visible in this listing.
// Only scalar integer types are considered; the result is true exactly for a
// 64-bit source and a 32-bit destination.
19239 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19240 !DstVT.isInteger())
19241 return false;
19242 unsigned SrcBits = SrcVT.getSizeInBits();
19243 unsigned DestBits = DstVT.getSizeInBits();
19244 return (SrcBits == 64 && DestBits == 32);
19245}
19246
// NOTE(review): the signature line (which declares `Val` and `VT2`) is not
// visible in this listing.
// Only values produced by a load node are considered.
19248 if (Val.getOpcode() != ISD::LOAD)
19249 return false;
19250
  // Both the loaded type and VT2 must be simple integer types.
19251 EVT VT1 = Val.getValueType();
19252 if (!VT1.isSimple() || !VT1.isInteger() ||
19253 !VT2.isSimple() || !VT2.isInteger())
19254 return false;
19255
19256 switch (VT1.getSimpleVT().SimpleTy) {
19257 default: break;
19258 case MVT::i1:
19259 case MVT::i8:
19260 case MVT::i16:
19261 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19262 return true;
19263 }
19264
19265 return false;
19266}
19267
// NOTE(review): the signature line (which declares `VT`) is not visible in
// this listing.
// Returns true only for f16 on a subtarget with FullFP16; see the rationale
// below.
19269 if (!VT.isSimple())
19270 return false;
19271
19272 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19273 // negate values directly (fneg is free). So, we don't want to let the DAG
19274 // combiner rewrite fneg into xors and some other instructions. For f16 and
19275 // FullFP16 argument passing, some bitcast nodes may be introduced,
19276 // triggering this DAG combine rewrite, so we are avoiding that with this.
19277 switch (VT.getSimpleVT().SimpleTy) {
19278 default: break;
19279 case MVT::f16:
19280 return Subtarget->hasFullFP16();
19281 }
19282
19283 return false;
19284}
19285
19286/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
19287/// of the vector elements.
19288static bool areExtractExts(Value *Ext1, Value *Ext2) {
19289 auto areExtDoubled = [](Instruction *Ext) {
19290 return Ext->getType()->getScalarSizeInBits() ==
19291 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
19292 };
19293
19294 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
19295 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
19296 !areExtDoubled(cast<Instruction>(Ext1)) ||
19297 !areExtDoubled(cast<Instruction>(Ext2)))
19298 return false;
19299
19300 return true;
19301}
19302
19303/// Check if sinking \p I's operands to I's basic block is profitable, because
19304/// the operands can be folded into a target instruction, e.g.
19305/// sext/zext can be folded into vsubl.
19307 SmallVectorImpl<Use *> &Ops) const {
  // The operand uses to sink are appended to Ops.
  // NOTE(review): the first signature line and one line of the shuffle matcher
  // further down are not visible in this listing.
19308 if (!I->getType()->isVectorTy())
19309 return false;
19310
  // NEON: only an add/sub of two extends that each double the element width
  // is handled. Every other opcode returns false.
19311 if (Subtarget->hasNEON()) {
19312 switch (I->getOpcode()) {
19313 case Instruction::Sub:
19314 case Instruction::Add: {
19315 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
19316 return false;
19317 Ops.push_back(&I->getOperandUse(0));
19318 Ops.push_back(&I->getOperandUse(1));
19319 return true;
19320 }
19321 default:
19322 return false;
19323 }
19324 }
19325
19326 if (!Subtarget->hasMVEIntegerOps())
19327 return false;
19328
  // True when I has exactly one use and that user is an FSub with I as its
  // operand 1.
19329 auto IsFMSMul = [&](Instruction *I) {
19330 if (!I->hasOneUse())
19331 return false;
19332 auto *Sub = cast<Instruction>(*I->users().begin());
19333 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
19334 };
  // True when operand 0 or operand 1 of I is an fneg.
19335 auto IsFMS = [&](Instruction *I) {
19336 if (match(I->getOperand(0), m_FNeg(m_Value())) ||
19337 match(I->getOperand(1), m_FNeg(m_Value())))
19338 return true;
19339 return false;
19340 };
19341
  // Decides, by opcode or intrinsic ID, whether instruction I accepts a sunk
  // operand at position Operand. Some opcodes accept it at any position,
  // others only at position 1.
19342 auto IsSinker = [&](Instruction *I, int Operand) {
19343 switch (I->getOpcode()) {
19344 case Instruction::Add:
19345 case Instruction::Mul:
19346 case Instruction::FAdd:
19347 case Instruction::ICmp:
19348 case Instruction::FCmp:
19349 return true;
19350 case Instruction::FMul:
19351 return !IsFMSMul(I);
19352 case Instruction::Sub:
19353 case Instruction::FSub:
19354 case Instruction::Shl:
19355 case Instruction::LShr:
19356 case Instruction::AShr:
19357 return Operand == 1;
19358 case Instruction::Call:
19359 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
19360 switch (II->getIntrinsicID()) {
19361 case Intrinsic::fma:
19362 return !IsFMS(I);
19363 case Intrinsic::sadd_sat:
19364 case Intrinsic::uadd_sat:
19365 case Intrinsic::arm_mve_add_predicated:
19366 case Intrinsic::arm_mve_mul_predicated:
19367 case Intrinsic::arm_mve_qadd_predicated:
19368 case Intrinsic::arm_mve_vhadd:
19369 case Intrinsic::arm_mve_hadd_predicated:
19370 case Intrinsic::arm_mve_vqdmull:
19371 case Intrinsic::arm_mve_vqdmull_predicated:
19372 case Intrinsic::arm_mve_vqdmulh:
19373 case Intrinsic::arm_mve_qdmulh_predicated:
19374 case Intrinsic::arm_mve_vqrdmulh:
19375 case Intrinsic::arm_mve_qrdmulh_predicated:
19376 case Intrinsic::arm_mve_fma_predicated:
19377 return true;
19378 case Intrinsic::ssub_sat:
19379 case Intrinsic::usub_sat:
19380 case Intrinsic::arm_mve_sub_predicated:
19381 case Intrinsic::arm_mve_qsub_predicated:
19382 case Intrinsic::arm_mve_hsub_predicated:
19383 case Intrinsic::arm_mve_vhsub:
19384 return Operand == 1;
19385 default:
19386 return false;
19387 }
19388 }
19389 return false;
19390 default:
19391 return false;
19392 }
19393 };
19394
19395 for (auto OpIdx : enumerate(I->operands())) {
19396 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
19397 // Make sure we are not already sinking this operand
19398 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
19399 continue;
19400
  // Look through a bitcast to the instruction that feeds it.
19401 Instruction *Shuffle = Op;
19402 if (Shuffle->getOpcode() == Instruction::BitCast)
19403 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
19404 // We are looking for a splat that can be sunk.
19405 if (!Shuffle ||
19406 !match(Shuffle, m_Shuffle(
19408 m_Undef(), m_ZeroMask())))
19409 continue;
19410 if (!IsSinker(I, OpIdx.index()))
19411 continue;
19412
19413 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
19414 // and vector registers
19415 for (Use &U : Op->uses()) {
19416 Instruction *Insn = cast<Instruction>(U.getUser());
19417 if (!IsSinker(Insn, U.getOperandNo()))
19418 return false;
19419 }
19420
  // Record the shuffle's first operand use, then the bitcast's (if there was
  // one), then I's own use of the operand.
19421 Ops.push_back(&Shuffle->getOperandUse(0));
19422 if (Shuffle != Op)
19423 Ops.push_back(&Op->getOperandUse(0));
19424 Ops.push_back(&OpIdx.value());
19425 }
19426 return true;
19427}
19428
// NOTE(review): the signature line (which declares `SVI`) is not visible in
// this listing.
// With MVE integer ops, maps a float scalar type to i32 and a half scalar type
// to i16. Returns nullptr in every other case.
19430 if (!Subtarget->hasMVEIntegerOps())
19431 return nullptr;
19432 Type *SVIType = SVI->getType();
19433 Type *ScalarType = SVIType->getScalarType();
19434
19435 if (ScalarType->isFloatTy())
19436 return Type::getInt32Ty(SVIType->getContext());
19437 if (ScalarType->isHalfTy())
19438 return Type::getInt16Ty(SVIType->getContext());
19439 return nullptr;
19440}
19441
// NOTE(review): the signature line (which declares `ExtVal`) is not visible in
// this listing.
// Decides whether folding the extend ExtVal into its load operand is wanted.
19443 EVT VT = ExtVal.getValueType();
19444
19445 if (!isTypeLegal(VT))
19446 return false;
19447
  // An expanding masked load is never folded.
19448 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19449 if (Ld->isExpandingLoad())
19450 return false;
19451 }
19452
19453 if (Subtarget->hasMVEIntegerOps())
19454 return true;
19455
19456 // Don't create a loadext if we can fold the extension into a wide/long
19457 // instruction.
19458 // If there's more than one user instruction, the loadext is desirable no
19459 // matter what. There can be two uses by the same instruction.
19460 if (ExtVal->use_empty() ||
19461 !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
19462 return true;
19463
  // With a single user, decline when that user is an ADD, SUB, SHL or VSHLIMM.
19464 SDNode *U = *ExtVal->use_begin();
19465 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19466 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19467 return false;
19468
19469 return true;
19470}
19471
// NOTE(review): the signature line (which declares `Ty1` and `Ty2`) is not
// visible in this listing.
// Both types must be integers and Ty1 must be a legal type.
19473 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19474 return false;
19475
19476 if (!isTypeLegal(EVT::getEVT(Ty1)))
19477 return false;
19478
19479 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19480
19481 // Assuming the caller doesn't have a zeroext or signext return parameter,
19482 // truncation all the way down to i1 is valid.
19483 return true;
19484}
19485
19486/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19487/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19488/// expanded to FMAs when this method returns true, otherwise fmuladd is
19489/// expanded to fmul + fadd.
19490///
19491/// ARM supports both fused and unfused multiply-add operations; we already
19492/// lower a pair of fmul and fadd to the latter so it's not clear that there
19493/// would be a gain or that the gain would be worthwhile enough to risk
19494/// correctness bugs.
19495///
19496/// For MVE, we set this to true as it helps simplify the need for some
19497/// patterns (and we don't have the non-fused floating point instruction).
19498bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19499 EVT VT) const {
19500 if (!VT.isSimple())
19501 return false;
19502
19503 switch (VT.getSimpleVT().SimpleTy) {
19504 case MVT::v4f32:
19505 case MVT::v8f16:
19506 return Subtarget->hasMVEFloatOps();
19507 case MVT::f16:
19508 return Subtarget->useFPVFMx16();
19509 case MVT::f32:
19510 return Subtarget->useFPVFMx();
19511 case MVT::f64:
19512 return Subtarget->useFPVFMx64();
19513 default:
19514 break;
19515 }
19516
19517 return false;
19518}
19519
19520static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19521 if (V < 0)
19522 return false;
19523
19524 unsigned Scale = 1;
19525 switch (VT.getSimpleVT().SimpleTy) {
19526 case MVT::i1:
19527 case MVT::i8:
19528 // Scale == 1;
19529 break;
19530 case MVT::i16:
19531 // Scale == 2;
19532 Scale = 2;
19533 break;
19534 default:
19535 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19536 // Scale == 4;
19537 Scale = 4;
19538 break;
19539 }
19540
19541 if ((V & (Scale - 1)) != 0)
19542 return false;
19543 return isUInt<5>(V / Scale);
19544}
19545
19546static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19547 const ARMSubtarget *Subtarget) {
19548 if (!VT.isInteger() && !VT.isFloatingPoint())
19549 return false;
19550 if (VT.isVector() && Subtarget->hasNEON())
19551 return false;
19552 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19553 !Subtarget->hasMVEFloatOps())
19554 return false;
19555
19556 bool IsNeg = false;
19557 if (V < 0) {
19558 IsNeg = true;
19559 V = -V;
19560 }
19561
19562 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19563
19564 // MVE: size * imm7
19565 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19566 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19567 case MVT::i32:
19568 case MVT::f32:
19569 return isShiftedUInt<7,2>(V);
19570 case MVT::i16:
19571 case MVT::f16:
19572 return isShiftedUInt<7,1>(V);
19573 case MVT::i8:
19574 return isUInt<7>(V);
19575 default:
19576 return false;
19577 }
19578 }
19579
19580 // half VLDR: 2 * imm8
19581 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19582 return isShiftedUInt<8, 1>(V);
19583 // VLDR and LDRD: 4 * imm8
19584 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19585 return isShiftedUInt<8, 2>(V);
19586
19587 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19588 // + imm12 or - imm8
19589 if (IsNeg)
19590 return isUInt<8>(V);
19591 return isUInt<12>(V);
19592 }
19593
19594 return false;
19595}
19596
19597/// isLegalAddressImmediate - Return true if the integer value can be used
19598/// as the offset of the target addressing mode for load / store of the
19599/// given type.
19600static bool isLegalAddressImmediate(int64_t V, EVT VT,
19601 const ARMSubtarget *Subtarget) {
19602 if (V == 0)
19603 return true;
19604
19605 if (!VT.isSimple())
19606 return false;
19607
19608 if (Subtarget->isThumb1Only())
19609 return isLegalT1AddressImmediate(V, VT);
19610 else if (Subtarget->isThumb2())
19611 return isLegalT2AddressImmediate(V, VT, Subtarget);
19612
19613 // ARM mode.
19614 if (V < 0)
19615 V = - V;
19616 switch (VT.getSimpleVT().SimpleTy) {
19617 default: return false;
19618 case MVT::i1:
19619 case MVT::i8:
19620 case MVT::i32:
19621 // +- imm12
19622 return isUInt<12>(V);
19623 case MVT::i16:
19624 // +- imm8
19625 return isUInt<8>(V);
19626 case MVT::f32:
19627 case MVT::f64:
19628 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19629 return false;
19630 return isShiftedUInt<8, 2>(V);
19631 }
19632}
19633
19635 EVT VT) const {
  // Decides whether the register scale AM.Scale is legal for an access of
  // type VT. Negative scales are rejected.
  // NOTE(review): the first signature line (which declares `AM`) is not
  // visible in this listing.
19636 int Scale = AM.Scale;
19637 if (Scale < 0)
19638 return false;
19639
19640 switch (VT.getSimpleVT().SimpleTy) {
19641 default: return false;
19642 case MVT::i1:
19643 case MVT::i8:
19644 case MVT::i16:
19645 case MVT::i32:
19646 if (Scale == 1)
19647 return true;
19648 // r + r << imm
  // Bit 0 is cleared first, so an odd scale is treated like Scale - 1.
19649 Scale = Scale & ~1;
19650 return Scale == 2 || Scale == 4 || Scale == 8;
19651 case MVT::i64:
19652 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19653 // version in Thumb mode.
19654 // r + r
19655 if (Scale == 1)
19656 return true;
19657 // r * 2 (this can be lowered to r + r).
19658 if (!AM.HasBaseReg && Scale == 2)
19659 return true;
19660 return false;
19661 case MVT::isVoid:
19662 // Note, we allow "void" uses (basically, uses that aren't loads or
19663 // stores), because arm allows folding a scale into many arithmetic
19664 // operations. This should be made more precise and revisited later.
19665
19666 // Allow r << imm, but the imm has to be a multiple of two.
19667 if (Scale & 1) return false;
19668 return isPowerOf2_32(Scale);
19669 }
19670}
19671
19673 EVT VT) const {
  // NOTE(review): the first signature line (which declares `AM`) is not
  // visible in this listing. VT is not consulted in the body below.
19674 const int Scale = AM.Scale;
19675
19676 // Negative scales are not supported in Thumb1.
19677 if (Scale < 0)
19678 return false;
19679
19680 // Thumb1 addressing modes do not support register scaling excepting the
19681 // following cases:
19682 // 1. Scale == 1 means no scaling.
19683 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19684 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19685}
19686
19687/// isLegalAddressingMode - Return true if the addressing mode represented
19688/// by AM is legal for this target, for a load/store of the specified type.
19690 const AddrMode &AM, Type *Ty,
19691 unsigned AS, Instruction *I) const {
  // NOTE(review): the first signature line (which declares `DL`) is not
  // visible in this listing. AS and I are not consulted in the body below.
  // The immediate offset must be legal for the value type before anything
  // else is checked.
19692 EVT VT = getValueType(DL, Ty, true);
19693 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19694 return false;
19695
19696 // Can never fold addr of global into load/store.
19697 if (AM.BaseGV)
19698 return false;
19699
19700 switch (AM.Scale) {
19701 case 0: // no scale reg, must be "r+i" or "r", or "i".
19702 break;
19703 default:
19704 // ARM doesn't support any R+R*scale+imm addr modes.
19705 if (AM.BaseOffs)
19706 return false;
19707
19708 if (!VT.isSimple())
19709 return false;
19710
  // Thumb1 and Thumb2 have their own scaled-mode checks. The switch below is
  // reached only when neither applies.
19711 if (Subtarget->isThumb1Only())
19712 return isLegalT1ScaledAddressingMode(AM, VT);
19713
19714 if (Subtarget->isThumb2())
19715 return isLegalT2ScaledAddressingMode(AM, VT);
19716
19717 int Scale = AM.Scale;
19718 switch (VT.getSimpleVT().SimpleTy) {
19719 default: return false;
19720 case MVT::i1:
19721 case MVT::i8:
19722 case MVT::i32:
  // The sign of the scale is ignored for these types.
19723 if (Scale < 0) Scale = -Scale;
19724 if (Scale == 1)
19725 return true;
19726 // r + r << imm
19727 return isPowerOf2_32(Scale & ~1);
19728 case MVT::i16:
19729 case MVT::i64:
19730 // r +/- r
19731 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19732 return true;
19733 // r * 2 (this can be lowered to r + r).
19734 if (!AM.HasBaseReg && Scale == 2)
19735 return true;
19736 return false;
19737
19738 case MVT::isVoid:
19739 // Note, we allow "void" uses (basically, uses that aren't loads or
19740 // stores), because arm allows folding a scale into many arithmetic
19741 // operations. This should be made more precise and revisited later.
19742
19743 // Allow r << imm, but the imm has to be a multiple of two.
19744 if (Scale & 1) return false;
19745 return isPowerOf2_32(Scale);
19746 }
19747 }
19748 return true;
19749}
19750
19751/// isLegalICmpImmediate - Return true if the specified immediate is legal
19752/// icmp immediate, that is the target has icmp instructions which can compare
19753/// a register against the immediate without having to materialize the
19754/// immediate into a register.
///
/// Only the low 32 bits of Imm are examined in the ARM and Thumb2 paths
/// (Imm is cast to uint32_t); the Thumb1 path compares the full value.
// NOTE(review): the signature line is missing from this listing (numbering
// jumps from 19754 to 19756); the body reads a parameter named Imm.
19756 // Thumb2 and ARM modes can use cmn for negative immediates.
// In both of those modes the immediate is accepted if either it or its
// 32-bit negation is encodable.
19757 if (!Subtarget->isThumb())
19758 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19759 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19760 if (Subtarget->isThumb2())
19761 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19762 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19763 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19764 return Imm >= 0 && Imm <= 255;
19765}
19766
19767/// isLegalAddImmediate - Return true if the specified immediate is a legal add
19768/// *or sub* immediate, that is the target has add or sub instructions which can
19769/// add a register with the immediate without having to materialize the
19770/// immediate into a register.
19772 // Same encoding for add/sub, just flip the sign.
19773 int64_t AbsImm = std::abs(Imm);
19774 if (!Subtarget->isThumb())
19775 return ARM_AM::getSOImmVal(AbsImm) != -1;
19776 if (Subtarget->isThumb2())
19777 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19778 // Thumb1 only has 8-bit unsigned immediate.
19779 return AbsImm >= 0 && AbsImm <= 255;
19780}
19781
19782// Return false to prevent folding
19783// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19784// if the folding leads to worse code.
// AddNode is the (add r, c0) node and ConstNode is c1; both constants are
// taken with cast<ConstantSDNode>, so they are required to be constants.
// NOTE(review): the line carrying the function name and the AddNode parameter
// is missing from this listing (numbering jumps from 19784 to 19786).
19786 SDValue ConstNode) const {
19787 // Let the DAGCombiner decide for vector types and large types.
19788 const EVT VT = AddNode.getValueType();
19789 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19790 return true;
19791
19792 // It is worse if c0 is legal add immediate, while c1*c0 is not
19793 // and has to be composed by at least two instructions.
19794 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19795 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19796 const int64_t C0 = C0Node->getSExtValue();
// CA is the product c0*c1 that the fold would have to materialize.
19797 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
// NOTE(review): listing line 19798, the condition guarding the next
// `return true`, is missing here -- restore it from upstream.
19799 return true;
// The fold is refused only when the product costs more than one instruction
// to materialize.
19800 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19801 return false;
19802
19803 // Default to true and let the DAGCombiner decide.
19804 return true;
19805}
19806
// NOTE(review): the line carrying this function's name and its first
// parameters (Ptr and VT are used below) is missing from this listing.
// Judging by the call sites and the "AddressingMode 2/3" comments this is
// presumably the static helper getARMIndexedAddressParts -- confirm upstream.
//
// Splits the ADD/SUB node Ptr into Base and Offset for an indexed access of
// type VT and sets isInc to say whether the offset is added or subtracted.
// Returns false if Ptr is neither ADD nor SUB, or if VT is not one of the
// handled integer types.
19808 bool isSEXTLoad, SDValue &Base,
19809 SDValue &Offset, bool &isInc,
19810 SelectionDAG &DAG) {
19811 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19812 return false;
19813
19814 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19815 // AddressingMode 3
19816 Base = Ptr->getOperand(0);
19817 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19818 int RHSC = (int)RHS->getZExtValue();
// A small negative constant is turned into a decrement by its magnitude.
19819 if (RHSC < 0 && RHSC > -256) {
19820 assert(Ptr->getOpcode() == ISD::ADD);
19821 isInc = false;
19822 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19823 return true;
19824 }
19825 }
19826 isInc = (Ptr->getOpcode() == ISD::ADD);
19827 Offset = Ptr->getOperand(1);
19828 return true;
19829 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19830 // AddressingMode 2
19831 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19832 int RHSC = (int)RHS->getZExtValue();
19833 if (RHSC < 0 && RHSC > -0x1000) {
19834 assert(Ptr->getOpcode() == ISD::ADD);
19835 isInc = false;
19836 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19837 Base = Ptr->getOperand(0);
19838 return true;
19839 }
19840 }
19841
19842 if (Ptr->getOpcode() == ISD::ADD) {
19843 isInc = true;
// If operand 0 is a shift, it becomes the offset and operand 1 the base;
// otherwise the operands keep their order.
19844 ARM_AM::ShiftOpc ShOpcVal=
19845 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
19846 if (ShOpcVal != ARM_AM::no_shift) {
19847 Base = Ptr->getOperand(1);
19848 Offset = Ptr->getOperand(0);
19849 } else {
19850 Base = Ptr->getOperand(0);
19851 Offset = Ptr->getOperand(1);
19852 }
19853 return true;
19854 }
19855
// Only SUB reaches this point (ADD returned above), so isInc is false here.
19856 isInc = (Ptr->getOpcode() == ISD::ADD);
19857 Base = Ptr->getOperand(0);
19858 Offset = Ptr->getOperand(1);
19859 return true;
19860 }
19861
19862 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19863 return false;
19864}
19865
// NOTE(review): the line carrying this function's name and its first
// parameters (Ptr is used below) is missing from this listing. Judging by the
// call sites this is presumably the static helper getT2IndexedAddressParts --
// confirm upstream.
//
// Splits the ADD/SUB node Ptr into Base and a constant Offset and sets isInc.
// Succeeds only when operand 1 is a non-zero constant whose magnitude is
// below 0x100; the offset returned is always that magnitude.
19867 bool isSEXTLoad, SDValue &Base,
19868 SDValue &Offset, bool &isInc,
19869 SelectionDAG &DAG) {
19870 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19871 return false;
19872
// Base is assigned before the legality check, so it is written even when
// the function goes on to return false.
19873 Base = Ptr->getOperand(0);
19874 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19875 int RHSC = (int)RHS->getZExtValue();
19876 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19877 assert(Ptr->getOpcode() == ISD::ADD);
19878 isInc = false;
19879 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19880 return true;
19881 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19882 isInc = Ptr->getOpcode() == ISD::ADD;
19883 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19884 return true;
19885 }
19886 }
19887
19888 return false;
19889}
19890
// Splits the ADD/SUB node Ptr into Base and a constant Offset for an indexed
// vector access of type VT, and sets isInc. Succeeds only when operand 1 is a
// constant that is a non-zero multiple of the element scale chosen below and
// whose magnitude is less than 0x80 times that scale. Alignment, IsMasked and
// isLE restrict which scales may be tried.
// NOTE(review): listing line 19893 is missing from the parameter list; the
// body uses Base and Offset, which are presumably declared there.
19891static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19892 bool isSEXTLoad, bool IsMasked, bool isLE,
19894 bool &isInc, SelectionDAG &DAG) {
19895 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19896 return false;
19897 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19898 return false;
19899
19900 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19901 // as opposed to a vldrw.32). This can allow extra addressing modes or
19902 // alignments for what is otherwise an equivalent instruction.
19903 bool CanChangeType = isLE && !IsMasked;
19904
19905 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
19906 int RHSC = (int)RHS->getZExtValue();
19907
// Range/alignment test for one candidate scale. On success it also fills
// in isInc and Offset (always the magnitude of the constant).
19908 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19909 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19910 assert(Ptr->getOpcode() == ISD::ADD);
19911 isInc = false;
19912 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19913 return true;
19914 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19915 isInc = Ptr->getOpcode() == ISD::ADD;
19916 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19917 return true;
19918 }
19919 return false;
19920 };
19921
19922 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19923 // (in BE/masked) type.
19924 Base = Ptr->getOperand(0);
19925 if (VT == MVT::v4i16) {
19926 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19927 return true;
19928 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19929 if (IsInRange(RHSC, 0x80, 1))
19930 return true;
// For the remaining types the scales are tried from widest (4) to narrowest
// (1); each is allowed when the type matches or the type may be changed.
19931 } else if (Alignment >= 4 &&
19932 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19933 IsInRange(RHSC, 0x80, 4))
19934 return true;
19935 else if (Alignment >= 2 &&
19936 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19937 IsInRange(RHSC, 0x80, 2))
19938 return true;
19939 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19940 return true;
19941 return false;
19942}
19943
19944/// getPreIndexedAddressParts - returns true by value, base pointer and
19945/// offset pointer and addressing mode by reference if the node's address
19946/// can be legally represented as pre-indexed load / store address.
///
/// Always returns false on Thumb1-only subtargets and for nodes that are not
/// one of the load/store kinds tested below. On success AM is set to PRE_INC
/// or PRE_DEC.
// NOTE(review): listing lines 19948 and 19950 are missing from the signature;
// the body uses N, Base and AM, which are presumably declared there.
19947bool
19949 SDValue &Offset,
19951 SelectionDAG &DAG) const {
19952 if (Subtarget->isThumb1Only())
19953 return false;
19954
// Gather the address, memory type, alignment and extension kind from
// whichever memory node N is.
19955 EVT VT;
19956 SDValue Ptr;
19957 Align Alignment;
19958 bool isSEXTLoad = false;
19959 bool IsMasked = false;
19960 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19961 Ptr = LD->getBasePtr();
19962 VT = LD->getMemoryVT();
19963 Alignment = LD->getAlign();
19964 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19965 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19966 Ptr = ST->getBasePtr();
19967 VT = ST->getMemoryVT();
19968 Alignment = ST->getAlign();
19969 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19970 Ptr = LD->getBasePtr();
19971 VT = LD->getMemoryVT();
19972 Alignment = LD->getAlign();
19973 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19974 IsMasked = true;
// NOTE(review): listing line 19975 is missing here; it presumably opens the
// branch that declares ST for a masked store -- restore it from upstream.
19976 Ptr = ST->getBasePtr();
19977 VT = ST->getMemoryVT();
19978 Alignment = ST->getAlign();
19979 IsMasked = true;
19980 } else
19981 return false;
19982
19983 bool isInc;
19984 bool isLegal = false;
// Vector accesses are only considered when MVE integer ops are available.
19985 if (VT.isVector())
19986 isLegal = Subtarget->hasMVEIntegerOps() &&
// NOTE(review): listing line 19987 (the callee of the argument list below,
// presumably getMVEIndexedAddressParts) is missing here.
19988 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19989 Subtarget->isLittle(), Base, Offset, isInc, DAG);
19990 else {
19991 if (Subtarget->isThumb2())
19992 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19993 Offset, isInc, DAG);
19994 else
19995 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19996 Offset, isInc, DAG);
19997 }
19998 if (!isLegal)
19999 return false;
20000
20001 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
20002 return true;
20003}
20004
20005/// getPostIndexedAddressParts - returns true by value, base pointer and
20006/// offset pointer and addressing mode by reference if this node can be
20007/// combined with a load / store to form a post-indexed load / store.
///
/// Op is the node that updates the address; on success Base, Offset and AM
/// (POST_INC or POST_DEC) are filled in. Returns false for nodes that are
/// not one of the load/store kinds tested below.
// NOTE(review): listing lines 20008 and 20011 are missing from the signature;
// the body uses N, Op and AM, which are presumably declared there.
20009 SDValue &Base,
20010 SDValue &Offset,
20012 SelectionDAG &DAG) const {
20013 EVT VT;
20014 SDValue Ptr;
20015 Align Alignment;
// isNonExt is assigned in every branch that does not return below.
20016 bool isSEXTLoad = false, isNonExt;
20017 bool IsMasked = false;
20018 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
20019 VT = LD->getMemoryVT();
20020 Ptr = LD->getBasePtr();
20021 Alignment = LD->getAlign();
20022 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
20023 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
20024 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
20025 VT = ST->getMemoryVT();
20026 Ptr = ST->getBasePtr();
20027 Alignment = ST->getAlign();
20028 isNonExt = !ST->isTruncatingStore();
20029 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
20030 VT = LD->getMemoryVT();
20031 Ptr = LD->getBasePtr();
20032 Alignment = LD->getAlign();
20033 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
20034 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
20035 IsMasked = true;
// NOTE(review): listing line 20036 is missing here; it presumably opens the
// branch that declares ST for a masked store -- restore it from upstream.
20037 VT = ST->getMemoryVT();
20038 Ptr = ST->getBasePtr();
20039 Alignment = ST->getAlign();
20040 isNonExt = !ST->isTruncatingStore();
20041 IsMasked = true;
20042 } else
20043 return false;
20044
20045 if (Subtarget->isThumb1Only()) {
20046 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
20047 // must be non-extending/truncating, i32, with an offset of 4.
20048 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
20049 if (Op->getOpcode() != ISD::ADD || !isNonExt)
20050 return false;
20051 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
20052 if (!RHS || RHS->getZExtValue() != 4)
20053 return false;
// The access must also be at least 4-byte aligned.
20054 if (Alignment < Align(4))
20055 return false;
20056
20057 Offset = Op->getOperand(1);
20058 Base = Op->getOperand(0);
20059 AM = ISD::POST_INC;
20060 return true;
20061 }
20062
20063 bool isInc;
20064 bool isLegal = false;
// Unlike the pre-indexed case, the helpers are given Op (the address
// update) rather than the memory node's own pointer.
20065 if (VT.isVector())
20066 isLegal = Subtarget->hasMVEIntegerOps() &&
20067 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
20068 Subtarget->isLittle(), Base, Offset,
20069 isInc, DAG);
20070 else {
20071 if (Subtarget->isThumb2())
20072 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
20073 isInc, DAG);
20074 else
20075 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
20076 isInc, DAG);
20077 }
20078 if (!isLegal)
20079 return false;
20080
20081 if (Ptr != Base) {
20082 // Swap base ptr and offset to catch more post-index load / store when
20083 // it's legal. In Thumb2 mode, offset must be an immediate.
20084 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
20085 !Subtarget->isThumb2())
// NOTE(review): listing line 20086, the body of this `if`, is missing here;
// per the comment above it presumably swaps Base and Offset.
20087
20088 // Post-indexed load / store update the base pointer.
20089 if (Ptr != Base)
20090 return false;
20091 }
20092
20093 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
20094 return true;
20095}
20096
// NOTE(review): the line carrying this function's name and its first
// parameter (Op is used below) is missing from this listing; this is
// presumably ARMTargetLowering::computeKnownBitsForTargetNode -- confirm
// upstream.
//
// Fills Known with the bits of Op that are known to be zero or one, for the
// target-specific opcodes handled in the switch. Known is reset first, so any
// opcode that is not handled leaves it with nothing known.
20098 KnownBits &Known,
20099 const APInt &DemandedElts,
20100 const SelectionDAG &DAG,
20101 unsigned Depth) const {
20102 unsigned BitWidth = Known.getBitWidth();
20103 Known.resetAll();
20104 switch (Op.getOpcode()) {
20105 default: break;
20106 case ARMISD::ADDC:
20107 case ARMISD::ADDE:
20108 case ARMISD::SUBC:
20109 case ARMISD::SUBE:
20110 // Special cases when we convert a carry to a boolean.
20111 if (Op.getResNo() == 0) {
20112 SDValue LHS = Op.getOperand(0);
20113 SDValue RHS = Op.getOperand(1);
20114 // (ADDE 0, 0, C) will give us a single bit.
20115 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
20116 isNullConstant(RHS)) {
// NOTE(review): listing line 20117, the statement that records the known
// bits for this case, is missing here -- restore it from upstream.
20118 return;
20119 }
20120 }
20121 break;
20122 case ARMISD::CMOV: {
20123 // Bits are known zero/one if known on the LHS and RHS.
20124 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
// Nothing known on the first operand means the intersection is empty too,
// so the second operand need not be examined.
20125 if (Known.isUnknown())
20126 return;
20127
20128 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
20129 Known = Known.intersectWith(KnownRHS);
20130 return;
20131 }
// NOTE(review): listing line 20132, the case label that opens the block
// below (it handles intrinsic nodes), is missing here.
20133 Intrinsic::ID IntID =
20134 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
20135 switch (IntID) {
20136 default: return;
20137 case Intrinsic::arm_ldaex:
20138 case Intrinsic::arm_ldrex: {
// Every result bit above the width of the memory type is known zero.
20139 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
20140 unsigned MemBits = VT.getScalarSizeInBits();
20141 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
20142 return;
20143 }
20144 }
20145 }
20146 case ARMISD::BFI: {
20147 // Conservatively, we can recurse down the first operand
20148 // and just mask out all affected bits.
20149 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20150
20151 // The operand to BFI is already a mask suitable for removing the bits it
20152 // sets.
20153 const APInt &Mask = Op.getConstantOperandAPInt(2);
20154 Known.Zero &= Mask;
20155 Known.One &= Mask;
20156 return;
20157 }
20158 case ARMISD::VGETLANEs:
20159 case ARMISD::VGETLANEu: {
20160 const SDValue &SrcSV = Op.getOperand(0);
20161 EVT VecVT = SrcSV.getValueType();
20162 assert(VecVT.isVector() && "VGETLANE expected a vector type");
20163 const unsigned NumSrcElts = VecVT.getVectorNumElements();
20164 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
20165 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
20166 "VGETLANE index out of bounds");
// Only the selected lane of the source vector is queried.
20167 unsigned Idx = Pos->getZExtValue();
20168 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
20169 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
20170
20171 EVT VT = Op.getValueType();
20172 const unsigned DstSz = VT.getScalarSizeInBits();
20173 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
// SrcSz is only read by the asserts below; the cast silences the unused
// variable warning when asserts are compiled out.
20174 (void)SrcSz;
20175 assert(SrcSz == Known.getBitWidth());
20176 assert(DstSz > SrcSz);
// Widen the lane's known bits to the result width, sign- or zero-extending
// to match the opcode.
20177 if (Op.getOpcode() == ARMISD::VGETLANEs)
20178 Known = Known.sext(DstSz);
20179 else {
20180 Known = Known.zext(DstSz);
20181 }
20182 assert(DstSz == Known.getBitWidth());
20183 break;
20184 }
20185 case ARMISD::VMOVrh: {
// The 16-bit operand is zero-extended to 32 bits.
20186 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20187 assert(KnownOp.getBitWidth() == 16);
20188 Known = KnownOp.zext(32);
20189 break;
20190 }
20191 case ARMISD::CSINC:
20192 case ARMISD::CSINV:
20193 case ARMISD::CSNEG: {
20194 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20195 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
20196
20197 // The result is either:
20198 // CSINC: KnownOp0 or KnownOp1 + 1
20199 // CSINV: KnownOp0 or ~KnownOp1
20200 // CSNEG: KnownOp0 or KnownOp1 * -1
20201 if (Op.getOpcode() == ARMISD::CSINC)
20202 KnownOp1 =
20203 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
20204 else if (Op.getOpcode() == ARMISD::CSINV)
// Swapping the Zero and One sets yields the known bits of the bitwise NOT.
20205 std::swap(KnownOp1.Zero, KnownOp1.One);
20206 else if (Op.getOpcode() == ARMISD::CSNEG)
20207 KnownOp1 = KnownBits::mul(
20208 KnownOp1, KnownBits::makeConstant(APInt(32, -1)));
20209
// Only bits known identically on both possible results are kept.
20210 Known = KnownOp0.intersectWith(KnownOp1);
20211 break;
20212 }
20213 }
20214}
20215
// NOTE(review): the line carrying this function's name is missing from this
// listing; it is presumably ARMTargetLowering::targetShrinkDemandedConstant
// -- confirm upstream.
//
// Given an AND with a constant mask and the set of result bits that are
// actually demanded, tries to replace the mask with an equivalent one that is
// cheaper to encode. Returns false when nothing was changed; otherwise
// returns the result of UseMask / TLO.CombineTo.
20217 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
20218 TargetLoweringOpt &TLO) const {
20219 // Delay optimization, so we don't have to deal with illegal types, or block
20220 // optimizations.
20221 if (!TLO.LegalOps)
20222 return false;
20223
20224 // Only optimize AND for now.
20225 if (Op.getOpcode() != ISD::AND)
20226 return false;
20227
20228 EVT VT = Op.getValueType();
20229
20230 // Ignore vectors.
20231 if (VT.isVector())
20232 return false;
20233
20234 assert(VT == MVT::i32 && "Unexpected integer type");
20235
20236 // Make sure the RHS really is a constant.
20237 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20238 if (!C)
20239 return false;
20240
20241 unsigned Mask = C->getZExtValue();
20242
// ShrunkMask is the smallest usable mask (only demanded bits of Mask) and
// ExpandedMask the largest (Mask plus every bit nobody demands).
20243 unsigned Demanded = DemandedBits.getZExtValue();
20244 unsigned ShrunkMask = Mask & Demanded;
20245 unsigned ExpandedMask = Mask | ~Demanded;
20246
20247 // If the mask is all zeros, let the target-independent code replace the
20248 // result with zero.
20249 if (ShrunkMask == 0)
20250 return false;
20251
20252 // If the mask is all ones, erase the AND. (Currently, the target-independent
20253 // code won't do this, so we have to do it explicitly to avoid an infinite
20254 // loop in obscure cases.)
20255 if (ExpandedMask == ~0U)
20256 return TLO.CombineTo(Op, Op.getOperand(0));
20257
// A candidate is usable when it contains every bit of ShrunkMask and no bit
// outside ExpandedMask.
20258 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
20259 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
20260 };
// Rewrites the AND with NewMask; if NewMask equals the existing mask no new
// node is built and true is returned directly.
20261 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
20262 if (NewMask == Mask)
20263 return true;
20264 SDLoc DL(Op);
20265 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
20266 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
20267 return TLO.CombineTo(Op, NewOp);
20268 };
20269
20270 // Prefer uxtb mask.
20271 if (IsLegalMask(0xFF))
20272 return UseMask(0xFF);
20273
20274 // Prefer uxth mask.
20275 if (IsLegalMask(0xFFFF))
20276 return UseMask(0xFFFF);
20277
20278 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
20279 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20280 if (ShrunkMask < 256)
20281 return UseMask(ShrunkMask);
20282
20283 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
20284 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20285 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
20286 return UseMask(ExpandedMask);
20287
20288 // Potential improvements:
20289 //
20290 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20291 // We could try to prefer Thumb1 immediates which can be lowered to a
20292 // two-instruction sequence.
20293 // We could try to recognize more legal ARM/Thumb2 immediates here.
20294
20295 return false;
20296}
20297
// NOTE(review): the line carrying this function's name is missing from this
// listing; it is presumably
// ARMTargetLowering::SimplifyDemandedBitsForTargetNode -- confirm upstream.
//
// Target hook that simplifies Op given the bits that are demanded of it. Two
// opcodes are handled here; anything not simplified is passed on to the call
// at the end of the function.
20299 SDValue Op, const APInt &OriginalDemandedBits,
20300 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20301 unsigned Depth) const {
20302 unsigned Opc = Op.getOpcode();
20303
20304 switch (Opc) {
20305 case ARMISD::ASRL:
20306 case ARMISD::LSRL: {
20307 // If this is result 0 and the other result is unused, see if the demand
20308 // bits allow us to shrink this long shift into a standard small shift in
20309 // the opposite direction.
20310 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
20311 isa<ConstantSDNode>(Op->getOperand(2))) {
20312 unsigned ShAmt = Op->getConstantOperandVal(2);
// Applies when only bits within the top ShAmt bits of the result are
// demanded; the node is then replaced by operand 1 shifted left by
// 32 - ShAmt.
20313 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20314 << (32 - ShAmt)))
20315 return TLO.CombineTo(
20316 Op, TLO.DAG.getNode(
20317 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20318 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20319 }
20320 break;
20321 }
20322 case ARMISD::VBICIMM: {
20323 SDValue Op0 = Op.getOperand(0);
20324 unsigned ModImm = Op.getConstantOperandVal(1);
20325 unsigned EltBits = 0;
20326 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
// If none of the bits in the decoded mask is demanded, the node is replaced
// by its first operand.
20327 if ((OriginalDemandedBits & Mask) == 0)
20328 return TLO.CombineTo(Op, Op0);
// No break is needed: this is the last case, so control leaves the switch.
20329 }
20330 }
20331
// NOTE(review): listing line 20332, the start of the return statement whose
// argument list follows, is missing here -- restore it from upstream.
20333 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
20334}
20335
20336//===----------------------------------------------------------------------===//
20337// ARM Inline Assembly Support
20338//===----------------------------------------------------------------------===//
20339
// NOTE(review): the signature line is missing from this listing (numbering
// jumps to 20341); the body uses names CI and IA whose declarations are not
// visible. This is presumably ARMTargetLowering::ExpandInlineAsm -- confirm
// upstream.
//
// Recognizes an inline-asm string consisting of the single statement
// "rev $0, $1" with a constraint string starting "=l,l". Returns false when
// the pattern does not match or the subtarget lacks V6 ops.
20341 // Looking for "rev" which is V6+.
20342 if (!Subtarget->hasV6Ops())
20343 return false;
20344
// NOTE(review): listing line 20345 (presumably the declaration of IA) is
// missing here.
20346 StringRef AsmStr = IA->getAsmString();
20347 SmallVector<StringRef, 4> AsmPieces;
// First split into statements on ';' and newline; only a single-statement
// string is considered.
20348 SplitString(AsmStr, AsmPieces, ";\n");
20349
20350 switch (AsmPieces.size()) {
20351 default: return false;
20352 case 1:
20353 AsmStr = AsmPieces[0];
20354 AsmPieces.clear();
// Then split that statement into tokens on space, tab and comma.
20355 SplitString(AsmStr, AsmPieces, " \t,");
20356
20357 // rev $0, $1
20358 if (AsmPieces.size() == 3 &&
20359 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
20360 IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
// NOTE(review): listing lines 20361 and 20363 are missing here: the
// declaration of Ty and the statement guarded by the 32-bit width check.
20362 if (Ty && Ty->getBitWidth() == 32)
20364 }
20365 break;
20366 }
20367
20368 return false;
20369}
20370
20371const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20372 // At this point, we have to lower this constraint to something else, so we
20373 // lower it to an "r" or "w". However, by doing this we will force the result
20374 // to be in register, while the X constraint is much more permissive.
20375 //
20376 // Although we are correct (we are free to emit anything, without
20377 // constraints), we might break use cases that would expect us to be more
20378 // efficient and emit something else.
20379 if (!Subtarget->hasVFP2Base())
20380 return "r";
20381 if (ConstraintVT.isFloatingPoint())
20382 return "w";
20383 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20384 (ConstraintVT.getSizeInBits() == 64 ||
20385 ConstraintVT.getSizeInBits() == 128))
20386 return "w";
20387
20388 return "r";
20389}
20390
20391/// getConstraintType - Given a constraint letter, return the type of
20392/// constraint it is for this target.
///
/// One- and two-character constraints listed below are classified here;
/// everything else is delegated to TargetLowering::getConstraintType.
// NOTE(review): the signature lines are missing from this listing (numbering
// jumps from 20392 to 20395); the body reads a parameter named Constraint.
20395 unsigned S = Constraint.size();
20396 if (S == 1) {
20397 switch (Constraint[0]) {
20398 default: break;
20399 case 'l': return C_RegisterClass;
20400 case 'w': return C_RegisterClass;
20401 case 'h': return C_RegisterClass;
20402 case 'x': return C_RegisterClass;
20403 case 't': return C_RegisterClass;
20404 case 'j': return C_Immediate; // Constant for movw.
20405 // An address with a single base register. Due to the way we
20406 // currently handle addresses it is the same as an 'r' memory constraint.
20407 case 'Q': return C_Memory;
20408 }
20409 } else if (S == 2) {
// For two-character constraints only the first character is inspected.
20410 switch (Constraint[0]) {
20411 default: break;
20412 case 'T': return C_RegisterClass;
20413 // All 'U+' constraints are addresses.
20414 case 'U': return C_Memory;
20415 }
20416 }
20417 return TargetLowering::getConstraintType(Constraint);
20418}
20419
20420/// Examine constraint type and operand type and determine a weight value.
20421/// This object must already have been set up with the operand type
20422/// and the current alternative constraint selected.
///
/// Only the first character of \p constraint is examined.
// NOTE(review): the lines carrying the return type and function name are
// missing from this listing (numbering jumps from 20422 to 20425); this is
// presumably the ARM override of getSingleConstraintMatchWeight -- confirm
// upstream.
20425 AsmOperandInfo &info, const char *constraint) const {
// NOTE(review): listing line 20426 (presumably the declaration and initial
// value of `weight`) is missing here.
20427 Value *CallOperandVal = info.CallOperandVal;
20428 // If we don't have a value, we can't do a match,
20429 // but allow it at the lowest weight.
20430 if (!CallOperandVal)
20431 return CW_Default;
20432 Type *type = CallOperandVal->getType();
20433 // Look at the constraint type.
20434 switch (*constraint) {
20435 default:
// NOTE(review): listing line 20436, the statement executed for the default
// case, is missing here.
20437 break;
20438 case 'l':
// For integer operands 'l' weighs as a specific register in Thumb and as a
// plain register otherwise; non-integer operands leave weight untouched.
20439 if (type->isIntegerTy()) {
20440 if (Subtarget->isThumb())
20441 weight = CW_SpecificReg;
20442 else
20443 weight = CW_Register;
20444 }
20445 break;
20446 case 'w':
20447 if (type->isFloatingPointTy())
20448 weight = CW_Register;
20449 break;
20450 }
20451 return weight;
20452}
20453
// Result type of the constraint lookup below: a register number paired with
// a register class.
20454using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20455
// NOTE(review): the line carrying this function's name is missing from this
// listing (numbering jumps from 20455 to 20457); judging by the fallback call
// at the end it is presumably ARMTargetLowering::getRegForInlineAsmConstraint
// -- confirm upstream.
//
// Maps an inline-asm register constraint and value type VT to a register
// class. Constraints not resolved by the switch fall through to the "{cc}"
// check and then to TargetLowering::getRegForInlineAsmConstraint.
20457 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20458 switch (Constraint.size()) {
20459 case 1:
20460 // GCC ARM Constraint Letters
20461 switch (Constraint[0]) {
20462 case 'l': // Low regs or general regs.
20463 if (Subtarget->isThumb())
20464 return RCPair(0U, &ARM::tGPRRegClass);
20465 return RCPair(0U, &ARM::GPRRegClass);
20466 case 'h': // High regs or no regs.
20467 if (Subtarget->isThumb())
20468 return RCPair(0U, &ARM::hGPRRegClass);
20469 break;
20470 case 'r':
20471 if (Subtarget->isThumb1Only())
20472 return RCPair(0U, &ARM::tGPRRegClass);
20473 return RCPair(0U, &ARM::GPRRegClass);
// 'w', 'x' and 't' share one shape: no class for MVT::Other, the
// single-precision class for the listed 16/32-bit types, and otherwise a
// class picked by total size (64 or 128 bits). They differ only in which
// register classes they name, and 't' additionally accepts i32.
20474 case 'w':
20475 if (VT == MVT::Other)
20476 break;
20477 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20478 return RCPair(0U, &ARM::SPRRegClass);
20479 if (VT.getSizeInBits() == 64)
20480 return RCPair(0U, &ARM::DPRRegClass);
20481 if (VT.getSizeInBits() == 128)
20482 return RCPair(0U, &ARM::QPRRegClass);
20483 break;
20484 case 'x':
20485 if (VT == MVT::Other)
20486 break;
20487 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20488 return RCPair(0U, &ARM::SPR_8RegClass);
20489 if (VT.getSizeInBits() == 64)
20490 return RCPair(0U, &ARM::DPR_8RegClass);
20491 if (VT.getSizeInBits() == 128)
20492 return RCPair(0U, &ARM::QPR_8RegClass);
20493 break;
20494 case 't':
20495 if (VT == MVT::Other)
20496 break;
20497 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20498 return RCPair(0U, &ARM::SPRRegClass);
20499 if (VT.getSizeInBits() == 64)
20500 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20501 if (VT.getSizeInBits() == 128)
20502 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20503 break;
20504 }
20505 break;
20506
20507 case 2:
// Two-character constraints: "Te" and "To" select the even and odd tGPR
// classes respectively.
20508 if (Constraint[0] == 'T') {
20509 switch (Constraint[1]) {
20510 default:
20511 break;
20512 case 'e':
20513 return RCPair(0U, &ARM::tGPREvenRegClass);
20514 case 'o':
20515 return RCPair(0U, &ARM::tGPROddRegClass);
20516 }
20517 }
20518 break;
20519
20520 default:
20521 break;
20522 }
20523
// "{cc}" (compared case-insensitively) maps to CPSR.
20524 if (StringRef("{cc}").equals_insensitive(Constraint))
20525 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20526
20527 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20528}
20529
20530/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20531/// vector. If it is invalid, don't add anything to Ops.
20533 StringRef Constraint,
20534 std::vector<SDValue> &Ops,
20535 SelectionDAG &DAG) const {
20536 SDValue Result;
20537
20538 // Currently only support length 1 constraints.
20539 if (Constraint.size() != 1)
20540 return;
20541
20542 char ConstraintLetter = Constraint[0];
20543 switch (ConstraintLetter) {
20544 default: break;
20545 case 'j':
20546 case 'I': case 'J': case 'K': case 'L':
20547 case 'M': case 'N': case 'O':
20549 if (!C)
20550 return;
20551
20552 int64_t CVal64 = C->getSExtValue();
20553 int CVal = (int) CVal64;
20554 // None of these constraints allow values larger than 32 bits. Check
20555 // that the value fits in an int.
20556 if (CVal != CVal64)
20557 return;
20558
20559 switch (ConstraintLetter) {
20560 case 'j':
20561 // Constant suitable for movw, must be between 0 and
20562 // 65535.
20563 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20564 if (CVal >= 0 && CVal <= 65535)
20565 break;
20566 return;
20567 case 'I':
20568 if (Subtarget->isThumb1Only()) {
20569 // This must be a constant between 0 and 255, for ADD
20570 // immediates.
20571 if (CVal >= 0 && CVal <= 255)
20572 break;
20573 } else if (Subtarget->isThumb2()) {
20574 // A constant that can be used as an immediate value in a
20575 // data-processing instruction.
20576 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20577 break;
20578 } else {
20579 // A constant that can be used as an immediate value in a
20580 // data-processing instruction.
20581 if (ARM_AM::getSOImmVal(CVal) != -1)
20582 break;
20583 }
20584 return;
20585
20586 case 'J':
20587 if (Subtarget->isThumb1Only()) {
20588 // This must be a constant between -255 and -1, for negated ADD
20589 // immediates. This can be used in GCC with an "n" modifier that
20590 // prints the negated value, for use with SUB instructions. It is
20591 // not useful otherwise but is implemented for compatibility.
20592 if (CVal >= -255 && CVal <= -1)
20593 break;
20594 } else {
20595 // This must be a constant between -4095 and 4095. It is not clear
20596 // what this constraint is intended for. Implemented for
20597 // compatibility with GCC.
20598 if (CVal >= -4095 && CVal <= 4095)
20599 break;
20600 }
20601 return;
20602
20603 case 'K':
20604 if (Subtarget->isThumb1Only()) {
20605 // A 32-bit value where only one byte has a nonzero value. Exclude
20606 // zero to match GCC. This constraint is used by GCC internally for
20607 // constants that can be loaded with a move/shift combination.
20608 // It is not useful otherwise but is implemented for compatibility.
20609 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20610 break;
20611 } else if (Subtarget->isThumb2()) {
20612 // A constant whose bitwise inverse can be used as an immediate
20613 // value in a data-processing instruction. This can be used in GCC
20614 // with a "B" modifier that prints the inverted value, for use with
20615 // BIC and MVN instructions. It is not useful otherwise but is
20616 // implemented for compatibility.
20617 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20618 break;
20619 } else {
20620 // A constant whose bitwise inverse can be used as an immediate
20621 // value in a data-processing instruction. This can be used in GCC
20622 // with a "B" modifier that prints the inverted value, for use with
20623 // BIC and MVN instructions. It is not useful otherwise but is
20624 // implemented for compatibility.
20625 if (ARM_AM::getSOImmVal(~CVal) != -1)
20626 break;
20627 }
20628 return;
20629
20630 case 'L':
20631 if (Subtarget->isThumb1Only()) {
20632 // This must be a constant between -7 and 7,
20633 // for 3-operand ADD/SUB immediate instructions.
20634 if (CVal >= -7 && CVal < 7)
20635 break;
20636 } else if (Subtarget->isThumb2()) {
20637 // A constant whose negation can be used as an immediate value in a
20638 // data-processing instruction. This can be used in GCC with an "n"
20639 // modifier that prints the negated value, for use with SUB
20640 // instructions. It is not useful otherwise but is implemented for
20641 // compatibility.
20642 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20643 break;
20644 } else {
20645 // A constant whose negation can be used as an immediate value in a
20646 // data-processing instruction. This can be used in GCC with an "n"
20647 // modifier that prints the negated value, for use with SUB
20648 // instructions. It is not useful otherwise but is implemented for
20649 // compatibility.
20650 if (ARM_AM::getSOImmVal(-CVal) != -1)
20651 break;
20652 }
20653 return;
20654
20655 case 'M':
20656 if (Subtarget->isThumb1Only()) {
20657 // This must be a multiple of 4 between 0 and 1020, for
20658 // ADD sp + immediate.
20659 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20660 break;
20661 } else {
20662 // A power of two or a constant between 0 and 32. This is used in
20663 // GCC for the shift amount on shifted register operands, but it is
20664 // useful in general for any shift amounts.
20665 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20666 break;
20667 }
20668 return;
20669
20670 case 'N':
20671 if (Subtarget->isThumb1Only()) {
20672 // This must be a constant between 0 and 31, for shift amounts.
20673 if (CVal >= 0 && CVal <= 31)
20674 break;
20675 }
20676 return;
20677
20678 case 'O':
20679 if (Subtarget->isThumb1Only()) {
20680 // This must be a multiple of 4 between -508 and 508, for
20681 // ADD/SUB sp = sp + immediate.
20682 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20683 break;
20684 }
20685 return;
20686 }
20687 Result = DAG.getSignedConstant(CVal, SDLoc(Op), Op.getValueType(),
20688 /*isTarget=*/true);
20689 break;
20690 }
20691
20692 if (Result.getNode()) {
20693 Ops.push_back(Result);
20694 return;
20695 }
20696 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20697}
20698
// getDivRemLibcall (name taken from the assert message below): selects the
// combined divide/remainder runtime-library call for a DIVREM or REM node.
//   N   - must be an SDIVREM, UDIVREM, SREM or UREM node (asserted).
//   SVT - integer width selecting the libcall; only i8/i16/i32/i64 are
//         handled, any other value reaches llvm_unreachable.
// Returns the SDIVREM_* libcall for SDIVREM/SREM and the UDIVREM_* libcall
// for UDIVREM/UREM.
// NOTE(review): the embedded listing numbers jump from 20698 to 20700, so the
// line that opens this definition (return type and function name) appears to
// be missing from this copy -- restore it from upstream before building.
20700 const SDNode *N, MVT::SimpleValueType SVT) {
20701 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20702 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20703 "Unhandled Opcode in getDivRemLibcall");
// Signedness is derived from the opcode only, not from the value type.
20704 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20705 N->getOpcode() == ISD::SREM;
20706 RTLIB::Libcall LC;
20707 switch (SVT) {
20708 default: llvm_unreachable("Unexpected request for libcall!");
20709 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20710 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20711 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20712 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20713 }
20714 return LC;
20715}
20716
// getDivRemArgList (name taken from the assert message below): builds the
// libcall argument list for a DIVREM/REM node from all of the node's operands.
//   N         - must be an SDIVREM, UDIVREM, SREM or UREM node (asserted).
//   Context   - used to turn each operand's EVT into an IR type.
//   Subtarget - consulted only for the Windows operand swap at the end.
// Each argument is flagged sign-extended for signed opcodes and zero-extended
// for unsigned ones.
// NOTE(review): listing lines 20717, 20724 and 20725 are absent from this
// copy; they presumably hold the start of the signature and the declarations
// of `Args` and `Entry`, which are used below without a visible definition.
20718 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20719 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20720 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20721 "Unhandled Opcode in getDivRemArgList");
20722 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20723 N->getOpcode() == ISD::SREM;
// One entry per operand, in operand order; `Entry` is reused across iterations.
20726 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20727 EVT ArgVT = N->getOperand(i).getValueType();
20728 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20729 Entry.Node = N->getOperand(i);
20730 Entry.Ty = ArgTy;
20731 Entry.IsSExt = isSigned;
20732 Entry.IsZExt = !isSigned;
20733 Args.push_back(Entry);
20734 }
// On Windows targets the first two arguments are exchanged.
// NOTE(review): presumably the Windows helper takes the operands in the
// opposite order -- confirm against the runtime's documentation.
20735 if (Subtarget->isTargetWindows() && Args.size() >= 2)
20736 std::swap(Args[0], Args[1]);
20737 return Args;
20738}
20739
// Custom-lowers an SDIVREM/UDIVREM node. Three strategies are tried in order:
//   1. i64 with a constant divisor: expandDIVREMByConstant on i32 halves.
//   2. i32 with hardware divide available: DIV + MUL + SUB.
//   3. Otherwise: a call to the divrem runtime-library helper.
// Returns a value producing both the quotient and the remainder.
// NOTE(review): listing lines 20753, 20790, 20794, 20795, 20802 and 20805 are
// absent from this copy. `Result`, `Args`, `Callee` and `CLI` are used below
// without a visible declaration, and the CLI setter chain has no visible
// terminating line. Restore them from upstream before building.
20740SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20741 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20742 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20743 Subtarget->isTargetWindows()) &&
20744 "Register-based DivRem lowering only");
20745 unsigned Opcode = Op->getOpcode();
20746 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20747 "Invalid opcode for Div/Rem lowering");
20748 bool isSigned = (Opcode == ISD::SDIVREM);
20749 EVT VT = Op->getValueType(0);
20750 SDLoc dl(Op);
20751
// Strategy 1: 64-bit division by a constant. On success Result holds four i32
// pieces: [0],[1] are paired into the first i64 result, [2],[3] into the
// second.
20752 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20754 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20755 SDValue Res0 =
20756 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20757 SDValue Res1 =
20758 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20759 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20760 {Res0, Res1});
20761 }
20762 }
20763
20764 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20765
20766 // If the target has hardware divide, use divide + multiply + subtract:
20767 // div = a / b
20768 // rem = a - b * div
20769 // return {div, rem}
20770 // This should be lowered into UDIV/SDIV + MLS later on.
20771 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20772 : Subtarget->hasDivideInARMMode();
// Strategy 2 applies to i32 only; other widths fall through to the libcall.
20773 if (hasDivide && Op->getValueType(0).isSimple() &&
20774 Op->getSimpleValueType(0) == MVT::i32) {
20775 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20776 const SDValue Dividend = Op->getOperand(0);
20777 const SDValue Divisor = Op->getOperand(1);
20778 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20779 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20780 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20781
20782 SDValue Values[2] = {Div, Rem};
20783 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20784 }
20785
// Strategy 3: runtime-library call, chained from the DAG entry node.
20786 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20787 VT.getSimpleVT().SimpleTy);
20788 SDValue InChain = DAG.getEntryNode();
20789
20791 DAG.getContext(),
20792 Subtarget);
20793
20796
// The helper's IR return type is a two-element struct {Ty, Ty}.
20797 Type *RetTy = StructType::get(Ty, Ty);
20798
// Windows only: thread the chain through WinDBZCheckDenominator before the
// call.
20799 if (Subtarget->isTargetWindows())
20800 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20801
20803 CLI.setDebugLoc(dl).setChain(InChain)
20804 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
20806
// Only the call's result value is returned; the output chain (.second) is
// dropped.
20807 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20808 return CallInfo.first;
20809}
20810
20811// Lowers REM using divmod helpers
20812// see RTABI section 4.2/4.3
// N is the remainder node being lowered (the code below distinguishes signed
// by checking for ISD::SREM). Returns the remainder value: either an i64
// rebuilt from the constant-divisor expansion, or the second result of the
// divmod helper call.
// NOTE(review): listing lines 20817, 20843, 20846, 20847 and 20856 are absent
// from this copy. `Result`, `Args` and `Callee` are used below without a
// visible declaration, `isSigned` has no visible use, and the CLI setter chain
// has no visible terminating line. Restore them from upstream before building.
20813SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20814 EVT VT = N->getValueType(0);
20815
// 64-bit remainder by a constant: expand on i32 halves and pair the first two
// pieces back into one value of N's type.
20816 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20818 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20819 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20820 Result[0], Result[1]);
20821 }
20822
20823 // Build return types (div and rem)
20824 std::vector<Type*> RetTyParams;
20825 Type *RetTyElement;
20826
20827 switch (VT.getSimpleVT().SimpleTy) {
20828 default: llvm_unreachable("Unexpected request for libcall!");
20829 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20830 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20831 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20832 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20833 }
20834
// The helper returns a struct of two identical integers.
20835 RetTyParams.push_back(RetTyElement);
20836 RetTyParams.push_back(RetTyElement);
20837 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20838 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20839
20840 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20841 SimpleTy);
20842 SDValue InChain = DAG.getEntryNode();
20844 Subtarget);
20845 bool isSigned = N->getOpcode() == ISD::SREM;
20848
// Windows only: thread the chain through WinDBZCheckDenominator before the
// call.
20849 if (Subtarget->isTargetWindows())
20850 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20851
20852 // Lower call
// The calling convention here is fixed to ARM_AAPCS rather than looked up
// from the libcall.
20853 CallLoweringInfo CLI(DAG);
20854 CLI.setChain(InChain)
20855 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
20857 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20858
20859 // Return second (rem) result operand (first contains div)
20860 SDNode *ResNode = CallResult.first.getNode();
20861 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20862 return ResNode->getOperand(1);
20863}
20864
// Custom-lowers a dynamic stack allocation; asserted to be Windows-only.
// Operand 0 is the chain, operand 1 the allocation size and operand 2 (read
// only on the first path) a constant alignment. Returns the merged pair
// {new stack pointer, chain}.
// NOTE(review): listing lines 20874 and 20876 are absent from this copy; they
// presumably hold the start of the `if` that tests the "no-stack-arg-probe"
// string and the declaration of `Align` -- restore them from upstream.
20865SDValue
20866ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20867 assert(Subtarget->isTargetWindows() && "unsupported target platform");
20868 SDLoc DL(Op);
20869
20870 // Get the inputs.
20871 SDValue Chain = Op.getOperand(0);
20872 SDValue Size = Op.getOperand(1);
20873
// First path: adjust SP directly without calling the stack-probe helper:
// SP = SP - Size, then, if an alignment is given, SP &= -alignment.
20875 "no-stack-arg-probe")) {
20877 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20878 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20879 Chain = SP.getValue(1);
20880 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20881 if (Align)
20882 SP =
20883 DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20884 DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32));
20885 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20886 SDValue Ops[2] = { SP, Chain };
20887 return DAG.getMergeValues(Ops, DL);
20888 }
20889
// Second path: the size is shifted right by 2 (a count of 4-byte words),
// placed in R4, and a WIN__CHKSTK node is emitted, glued to that register
// copy.
20890 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20891 DAG.getConstant(2, DL, MVT::i32));
20892
20893 SDValue Glue;
20894 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20895 Glue = Chain.getValue(1);
20896
20897 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20898 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20899
// The resulting stack pointer is read back from SP after the WIN__CHKSTK node.
20900 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20901 Chain = NewSP.getValue(1);
20902
20903 SDValue Ops[2] = { NewSP, Chain };
20904 return DAG.getMergeValues(Ops, DL);
20905}
20906
20907SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20908 bool IsStrict = Op->isStrictFPOpcode();
20909 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20910 const unsigned DstSz = Op.getValueType().getSizeInBits();
20911 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20912 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20913 "Unexpected type for custom-lowering FP_EXTEND");
20914
20915 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20916 "With both FP DP and 16, any FP conversion is legal!");
20917
20918 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20919 "With FP16, 16 to 32 conversion is legal!");
20920
20921 // Converting from 32 -> 64 is valid if we have FP64.
20922 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20923 // FIXME: Remove this when we have strict fp instruction selection patterns
20924 if (IsStrict) {
20925 SDLoc Loc(Op);
20926 SDValue Result = DAG.getNode(ISD::FP_EXTEND,
20927 Loc, Op.getValueType(), SrcVal);
20928 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
20929 }
20930 return Op;
20931 }
20932
20933 // Either we are converting from 16 -> 64, without FP16 and/or
20934 // FP.double-precision or without Armv8-fp. So we must do it in two
20935 // steps.
20936 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
20937 // without FP16. So we must do a function call.
20938 SDLoc Loc(Op);
20939 RTLIB::Libcall LC;
20940 MakeLibCallOptions CallOptions;
20941 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20942 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20943 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20944 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20945 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20946 if (Supported) {
20947 if (IsStrict) {
20948 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20949 {DstVT, MVT::Other}, {Chain, SrcVal});
20950 Chain = SrcVal.getValue(1);
20951 } else {
20952 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
20953 }
20954 } else {
20955 LC = RTLIB::getFPEXT(SrcVT, DstVT);
20956 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20957 "Unexpected type for custom-lowering FP_EXTEND");
20958 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20959 Loc, Chain);
20960 }
20961 }
20962
20963 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
20964}
20965
// Custom-lowers a floating-point rounding node (strict or non-strict).
// A 32-bit source on a subtarget with FP16 is returned unchanged; every other
// case becomes a runtime-library call chosen by RTLIB::getFPROUND.
// Returns the rounded value; for strict opcodes, the value merged with the
// output chain.
// NOTE(review): listing line 20993 is absent from this copy; it presumably
// declares `Result`, which is assigned below without a visible declaration.
20966SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20967 bool IsStrict = Op->isStrictFPOpcode();
20968
// Strict nodes carry the chain as operand 0, so the value is operand 1.
20969 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20970 EVT SrcVT = SrcVal.getValueType();
20971 EVT DstVT = Op.getValueType();
20972 const unsigned DstSz = Op.getValueType().getSizeInBits();
20973 const unsigned SrcSz = SrcVT.getSizeInBits();
// DstSz is only read inside the assert; the cast silences the unused-variable
// warning when asserts are compiled out.
20974 (void)DstSz;
20975 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20976 "Unexpected type for custom-lowering FP_ROUND");
20977
20978 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20979 "With both FP DP and 16, any FP conversion is legal!");
20980
20981 SDLoc Loc(Op);
20982
20983 // Instruction from 32 -> 16 if hasFP16 is valid
20984 if (SrcSz == 32 && Subtarget->hasFP16())
20985 return Op;
20986
20987 // Lib call from 32 -> 16 / 64 -> [32, 16]
20988 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
20989 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20990 "Unexpected type for custom-lowering FP_ROUND");
20991 MakeLibCallOptions CallOptions;
// Non-strict nodes have no incoming chain, so an empty SDValue is passed.
20992 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20994 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20995 Loc, Chain);
20996 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
20997}
20998
// Unconditionally returns false (see the comment in the body).
// NOTE(review): listing line 21000 is absent from this copy; it presumably
// holds the function name and parameter list that follow the `bool` return
// type -- restore it from upstream.
20999bool
21001 // The ARM target isn't yet aware of offsets.
21002 return false;
21003}
21004
// Tests whether `v` is an inverted bit-field mask: its complement must be a
// shifted mask as defined by isShiftedMask_32. The all-ones value (whose
// complement is zero) is rejected up front.
// NOTE(review): listing line 21005 is absent from this copy; it presumably
// holds the signature that declares `v` -- restore it from upstream.
21006 if (v == 0xffffffff)
21007 return false;
21008
21009 // there can be 1's on either or both "outsides", all the "inside"
21010 // bits must be 0's
21011 return isShiftedMask_32(~v);
21012}
21013
21014/// isFPImmLegal - Returns true if the target can instruction select the
21015/// specified FP immediate natively. If false, the legalizer will
21016/// materialize the FP immediate as a load from a constant pool.
// `ForCodeSize` is not read in the visible body.
// NOTE(review): listing line 21017 is absent from this copy; it presumably
// holds the start of the signature declaring `Imm` and `VT`.
21018 bool ForCodeSize) const {
// Without VFP3 no FP immediate is accepted.
21019 if (!Subtarget->hasVFP3Base())
21020 return false;
// f16 immediates are only considered when full FP16 is available.
21021 if (VT == MVT::f16 && Subtarget->hasFullFP16())
21022 return ARM_AM::getFP16Imm(Imm) != -1;
// For f32 with full FP16, the getFP32FP16Imm encoding is tried first; if it
// fails the plain f32 check below still applies.
21023 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
21024 ARM_AM::getFP32FP16Imm(Imm) != -1)
21025 return true;
21026 if (VT == MVT::f32)
21027 return ARM_AM::getFP32Imm(Imm) != -1;
21028 if (VT == MVT::f64 && Subtarget->hasFP64())
21029 return ARM_AM::getFP64Imm(Imm) != -1;
21030 return false;
21031}
21032
21033/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
21034/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
21035/// specified in the intrinsic calls.
// Fills `Info` (opcode, memory type, pointer, offset, alignment, flags) for
// the intrinsic call `I` and returns true; returns false for any intrinsic ID
// not listed in the switch. Besides the NEON cases, the switch also handles
// MVE load/store, MVE gather/scatter and the exclusive load/store intrinsics.
// `MF` is not read in the visible body.
// NOTE(review): listing lines 21036, 21223, 21235, 21245 and 21255 are absent
// from this copy. 21036 presumably starts the signature (declaring `Info`);
// the other four sit where the exclusive-access cases would be expected to
// set `Info.flags` -- restore them from upstream.
21037 const CallInst &I,
21038 MachineFunction &MF,
21039 unsigned Intrinsic) const {
21040 switch (Intrinsic) {
// NEON structured loads: pointer is argument 0, alignment is the constant in
// the last argument.
21041 case Intrinsic::arm_neon_vld1:
21042 case Intrinsic::arm_neon_vld2:
21043 case Intrinsic::arm_neon_vld3:
21044 case Intrinsic::arm_neon_vld4:
21045 case Intrinsic::arm_neon_vld2lane:
21046 case Intrinsic::arm_neon_vld3lane:
21047 case Intrinsic::arm_neon_vld4lane:
21048 case Intrinsic::arm_neon_vld2dup:
21049 case Intrinsic::arm_neon_vld3dup:
21050 case Intrinsic::arm_neon_vld4dup: {
21051 Info.opc = ISD::INTRINSIC_W_CHAIN;
21052 // Conservatively set memVT to the entire set of vectors loaded.
21053 auto &DL = I.getDataLayout();
// The memory type is modelled as a vector of i64 covering the full result
// size (result bits / 64 elements).
21054 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
21055 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21056 Info.ptrVal = I.getArgOperand(0);
21057 Info.offset = 0;
21058 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
21059 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
21060 // volatile loads with NEON intrinsics not supported
21061 Info.flags = MachineMemOperand::MOLoad;
21062 return true;
21063 }
// NEON vld1xN: the pointer is the last argument and the alignment comes from
// that argument's parameter alignment, defaulting to 1.
21064 case Intrinsic::arm_neon_vld1x2:
21065 case Intrinsic::arm_neon_vld1x3:
21066 case Intrinsic::arm_neon_vld1x4: {
21067 Info.opc = ISD::INTRINSIC_W_CHAIN;
21068 // Conservatively set memVT to the entire set of vectors loaded.
21069 auto &DL = I.getDataLayout();
21070 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
21071 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21072 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
21073 Info.offset = 0;
21074 Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
21075 // volatile loads with NEON intrinsics not supported
21076 Info.flags = MachineMemOperand::MOLoad;
21077 return true;
21078 }
// NEON structured stores: pointer is argument 0, alignment is the constant in
// the last argument.
21079 case Intrinsic::arm_neon_vst1:
21080 case Intrinsic::arm_neon_vst2:
21081 case Intrinsic::arm_neon_vst3:
21082 case Intrinsic::arm_neon_vst4:
21083 case Intrinsic::arm_neon_vst2lane:
21084 case Intrinsic::arm_neon_vst3lane:
21085 case Intrinsic::arm_neon_vst4lane: {
21086 Info.opc = ISD::INTRINSIC_VOID;
21087 // Conservatively set memVT to the entire set of vectors stored.
21088 auto &DL = I.getDataLayout();
21089 unsigned NumElts = 0;
// Sum the sizes of the vector arguments starting at index 1, stopping at the
// first non-vector argument.
21090 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
21091 Type *ArgTy = I.getArgOperand(ArgI)->getType();
21092 if (!ArgTy->isVectorTy())
21093 break;
21094 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
21095 }
21096 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21097 Info.ptrVal = I.getArgOperand(0);
21098 Info.offset = 0;
21099 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
21100 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
21101 // volatile stores with NEON intrinsics not supported
21102 Info.flags = MachineMemOperand::MOStore;
21103 return true;
21104 }
// NEON vst1xN: same size computation as above, but the alignment comes from
// argument 0's parameter alignment, defaulting to 1.
21105 case Intrinsic::arm_neon_vst1x2:
21106 case Intrinsic::arm_neon_vst1x3:
21107 case Intrinsic::arm_neon_vst1x4: {
21108 Info.opc = ISD::INTRINSIC_VOID;
21109 // Conservatively set memVT to the entire set of vectors stored.
21110 auto &DL = I.getDataLayout();
21111 unsigned NumElts = 0;
21112 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
21113 Type *ArgTy = I.getArgOperand(ArgI)->getType();
21114 if (!ArgTy->isVectorTy())
21115 break;
21116 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
21117 }
21118 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21119 Info.ptrVal = I.getArgOperand(0);
21120 Info.offset = 0;
21121 Info.align = I.getParamAlign(0).valueOrOne();
21122 // volatile stores with NEON intrinsics not supported
21123 Info.flags = MachineMemOperand::MOStore;
21124 return true;
21125 }
// MVE interleaving loads: memVT is Factor * 2 elements of i64 and the
// alignment is the scalar element size in bytes.
21126 case Intrinsic::arm_mve_vld2q:
21127 case Intrinsic::arm_mve_vld4q: {
21128 Info.opc = ISD::INTRINSIC_W_CHAIN;
21129 // Conservatively set memVT to the entire set of vectors loaded.
21130 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
21131 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
21132 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21133 Info.ptrVal = I.getArgOperand(0);
21134 Info.offset = 0;
21135 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21136 // volatile loads with MVE intrinsics not supported
21137 Info.flags = MachineMemOperand::MOLoad;
21138 return true;
21139 }
// MVE interleaving stores: mirror of the load case, with the vector type
// taken from argument 1.
21140 case Intrinsic::arm_mve_vst2q:
21141 case Intrinsic::arm_mve_vst4q: {
21142 Info.opc = ISD::INTRINSIC_VOID;
21143 // Conservatively set memVT to the entire set of vectors stored.
21144 Type *VecTy = I.getArgOperand(1)->getType();
21145 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
21146 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21147 Info.ptrVal = I.getArgOperand(0);
21148 Info.offset = 0;
21149 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21150 // volatile stores with MVE intrinsics not supported
21151 Info.flags = MachineMemOperand::MOStore;
21152 return true;
21153 }
// MVE gathers and scatters (all cases below up to the exclusive-access ones):
// no single pointer value is recorded (ptrVal is null), alignment is 1, and
// the load/store flag is OR-ed into the existing flags rather than assigned.
21154 case Intrinsic::arm_mve_vldr_gather_base:
21155 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
21156 Info.opc = ISD::INTRINSIC_W_CHAIN;
21157 Info.ptrVal = nullptr;
21158 Info.memVT = MVT::getVT(I.getType());
21159 Info.align = Align(1);
21160 Info.flags |= MachineMemOperand::MOLoad;
21161 return true;
21162 }
// Write-back gather: the loaded vector type is contained type 0 of the
// call's result type.
21163 case Intrinsic::arm_mve_vldr_gather_base_wb:
21164 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
21165 Info.opc = ISD::INTRINSIC_W_CHAIN;
21166 Info.ptrVal = nullptr;
21167 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
21168 Info.align = Align(1);
21169 Info.flags |= MachineMemOperand::MOLoad;
21170 return true;
21171 }
// Offset gather: the in-memory element width (bits) is the constant in
// argument 2; the element count matches the result vector.
21172 case Intrinsic::arm_mve_vldr_gather_offset:
21173 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
21174 Info.opc = ISD::INTRINSIC_W_CHAIN;
21175 Info.ptrVal = nullptr;
21176 MVT DataVT = MVT::getVT(I.getType());
21177 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
21178 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21179 DataVT.getVectorNumElements());
21180 Info.align = Align(1);
21181 Info.flags |= MachineMemOperand::MOLoad;
21182 return true;
21183 }
21184 case Intrinsic::arm_mve_vstr_scatter_base:
21185 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
21186 Info.opc = ISD::INTRINSIC_VOID;
21187 Info.ptrVal = nullptr;
21188 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21189 Info.align = Align(1);
21190 Info.flags |= MachineMemOperand::MOStore;
21191 return true;
21192 }
// Write-back scatter uses INTRINSIC_W_CHAIN, unlike the other scatters.
21193 case Intrinsic::arm_mve_vstr_scatter_base_wb:
21194 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
21195 Info.opc = ISD::INTRINSIC_W_CHAIN;
21196 Info.ptrVal = nullptr;
21197 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21198 Info.align = Align(1);
21199 Info.flags |= MachineMemOperand::MOStore;
21200 return true;
21201 }
// Offset scatter: the in-memory element width (bits) is the constant in
// argument 3; the element count matches the stored vector (argument 2).
21202 case Intrinsic::arm_mve_vstr_scatter_offset:
21203 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
21204 Info.opc = ISD::INTRINSIC_VOID;
21205 Info.ptrVal = nullptr;
21206 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
21207 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
21208 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21209 DataVT.getVectorNumElements());
21210 Info.align = Align(1);
21211 Info.flags |= MachineMemOperand::MOStore;
21212 return true;
21213 }
// Exclusive loads: the accessed type is the element type of pointer
// argument 0, with its ABI alignment.
21214 case Intrinsic::arm_ldaex:
21215 case Intrinsic::arm_ldrex: {
21216 auto &DL = I.getDataLayout();
21217 Type *ValTy = I.getParamElementType(0);
21218 Info.opc = ISD::INTRINSIC_W_CHAIN;
21219 Info.memVT = MVT::getVT(ValTy);
21220 Info.ptrVal = I.getArgOperand(0);
21221 Info.offset = 0;
21222 Info.align = DL.getABITypeAlign(ValTy);
21224 return true;
21225 }
// Exclusive stores: the pointer is argument 1.
21226 case Intrinsic::arm_stlex:
21227 case Intrinsic::arm_strex: {
21228 auto &DL = I.getDataLayout();
21229 Type *ValTy = I.getParamElementType(1);
21230 Info.opc = ISD::INTRINSIC_W_CHAIN;
21231 Info.memVT = MVT::getVT(ValTy);
21232 Info.ptrVal = I.getArgOperand(1);
21233 Info.offset = 0;
21234 Info.align = DL.getABITypeAlign(ValTy);
21236 return true;
21237 }
// 64-bit exclusive stores: fixed i64 memory type, 8-byte alignment, pointer
// in argument 2.
21238 case Intrinsic::arm_stlexd:
21239 case Intrinsic::arm_strexd:
21240 Info.opc = ISD::INTRINSIC_W_CHAIN;
21241 Info.memVT = MVT::i64;
21242 Info.ptrVal = I.getArgOperand(2);
21243 Info.offset = 0;
21244 Info.align = Align(8);
21246 return true;
21247
// 64-bit exclusive loads: fixed i64 memory type, 8-byte alignment, pointer in
// argument 0.
21248 case Intrinsic::arm_ldaexd:
21249 case Intrinsic::arm_ldrexd:
21250 Info.opc = ISD::INTRINSIC_W_CHAIN;
21251 Info.memVT = MVT::i64;
21252 Info.ptrVal = I.getArgOperand(0);
21253 Info.offset = 0;
21254 Info.align = Align(8);
21256 return true;
21257
21258 default:
21259 break;
21260 }
21261
21262 return false;
21263}
21264
21265/// Returns true if it is beneficial to convert a load of a constant
21266/// to just the constant itself.
// The visible body decides on the type's width alone: integer types of 1 to 32
// bits return true; a width of 0 or more than 32 bits returns false. `Ty` is
// asserted to be an integer type.
// NOTE(review): listing line 21267 is absent from this copy; it presumably
// holds the start of the signature.
21268 Type *Ty) const {
21269 assert(Ty->isIntegerTy());
21270
21271 unsigned Bits = Ty->getPrimitiveSizeInBits();
21272 if (Bits == 0 || Bits > 32)
21273 return false;
21274 return true;
21275}
21276
// When the final return is reached, the result is true only if Index is 0 or
// equals the number of elements of ResVT.
// NOTE(review): listing lines 21277 and 21279 are absent from this copy.
// 21277 presumably starts the signature (declaring `ResVT`), and 21279
// presumably holds the condition guarding the early `return false` below --
// restore both from upstream.
21278 unsigned Index) const {
21280 return false;
21281
21282 return (Index == 0 || Index == ResVT.getVectorNumElements());
21283}
21284
// makeDMB (name taken from the llvm_unreachable message below): emits a data
// memory barrier at the builder's insertion point and returns the created
// call. `Domain` selects the barrier option passed to the dmb intrinsic.
// NOTE(review): listing line 21285 is absent from this copy; it presumably
// holds the start of the signature declaring `Builder`.
21286 ARM_MB::MemBOpt Domain) const {
21287 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21288
21289 // First, if the target has no DMB, see what fallback we can use.
21290 if (!Subtarget->hasDataBarrier()) {
21291 // Some ARMv6 cpus can support data barriers with an mcr instruction.
21292 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
21293 // here.
21294 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
21295 Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
// Fixed operand list for the arm_mcr intrinsic: 15, 0, 0, 7, 10, 5.
21296 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
21297 Builder.getInt32(0), Builder.getInt32(7),
21298 Builder.getInt32(10), Builder.getInt32(5)};
21299 return Builder.CreateCall(MCR, args);
21300 } else {
21301 // Instead of using barriers, atomic accesses on these subtargets use
21302 // libcalls.
21303 llvm_unreachable("makeDMB on a target so old that it has no barriers");
21304 }
21305 } else {
21306 Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
21307 // Only a full system barrier exists in the M-class architectures.
21308 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
21309 Constant *CDomain = Builder.getInt32(Domain);
21310 return Builder.CreateCall(DMB, CDomain);
21311 }
21312}
21313
21314// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
// emitLeadingFence (name taken from the llvm_unreachable message below):
// returns the barrier emitted for `Inst` under ordering `Ord`, or nullptr when
// none is needed.
// NOTE(review): listing lines 21315, 21319, 21320, 21322, 21323, 21325, 21329
// and 21330 are absent from this copy. They presumably hold the start of the
// signature and every `case` label of the switch, so which ordering selects
// which branch cannot be read from this listing -- restore them from upstream.
21316 Instruction *Inst,
21317 AtomicOrdering Ord) const {
21318 switch (Ord) {
21321 llvm_unreachable("Invalid fence: unordered/non-atomic");
21324 return nullptr; // Nothing to do
// This branch returns early unless Inst has an atomic store, and otherwise
// falls through to the barrier selection.
21326 if (!Inst->hasAtomicStore())
21327 return nullptr; // Nothing to do
21328 [[fallthrough]];
// ISHST is used when the subtarget prefers it, otherwise ISH.
21331 if (Subtarget->preferISHSTBarriers())
21332 return makeDMB(Builder, ARM_MB::ISHST);
21333 // FIXME: add a comment with a link to documentation justifying this.
21334 else
21335 return makeDMB(Builder, ARM_MB::ISH);
21336 }
21337 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21338}
21339
// emitTrailingFence (name taken from the llvm_unreachable message below):
// returns the barrier emitted under ordering `Ord`, or nullptr when none is
// needed. The only barrier the visible code creates is an ISH dmb. `Inst` is
// not read in the visible lines.
// NOTE(review): listing lines 21340, 21344, 21345, 21347, 21348 and
// 21350-21352 are absent from this copy. They presumably hold the start of the
// signature and every `case` label of the switch -- restore them from
// upstream.
21341 Instruction *Inst,
21342 AtomicOrdering Ord) const {
21343 switch (Ord) {
21346 llvm_unreachable("Invalid fence: unordered/not-atomic");
21349 return nullptr; // Nothing to do
21353 return makeDMB(Builder, ARM_MB::ISH);
21354 }
21355 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21356}
21357
21358// Loads and stores less than 64-bits are already atomic; ones above that
21359// are doomed anyway, so defer to the default libcall and blame the OS when
21360// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21361// anything for those.
// NOTE(review): listing lines 21362, 21363 and 21374 are absent from this
// copy. They presumably hold the signature (declaring `SI`) and the
// false-branch of the conditional return below -- restore them from upstream.
// 64-bit atomic store support as computed here: never on M-class, v7+ in
// Thumb mode, v6+ in ARM mode.
21364 bool has64BitAtomicStore;
21365 if (Subtarget->isMClass())
21366 has64BitAtomicStore = false;
21367 else if (Subtarget->isThumb())
21368 has64BitAtomicStore = Subtarget->hasV7Ops();
21369 else
21370 has64BitAtomicStore = Subtarget->hasV6Ops();
21371
// Only exactly-64-bit stores on a capable subtarget select Expand.
21372 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21373 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21375}
21376
21377// Loads and stores less than 64-bits are already atomic; ones above that
21378// are doomed anyway, so defer to the default libcall and blame the OS when
21379// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21380// anything for those.
21381// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21382// guarantee, see DDI0406C ARM architecture reference manual,
21383// sections A8.8.72-74 LDRD)
// NOTE(review): listing lines 21384, 21385 and 21396 are absent from this
// copy. They presumably hold the signature (declaring `LI`) and the
// false-branch of the conditional return below -- restore them from upstream.
// 64-bit atomic load support as computed here: never on M-class, v7+ in Thumb
// mode, v6+ in ARM mode.
21386 bool has64BitAtomicLoad;
21387 if (Subtarget->isMClass())
21388 has64BitAtomicLoad = false;
21389 else if (Subtarget->isThumb())
21390 has64BitAtomicLoad = Subtarget->hasV7Ops();
21391 else
21392 has64BitAtomicLoad = Subtarget->hasV6Ops();
21393
// Only exactly-64-bit loads on a capable subtarget select LLOnly.
21394 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21395 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21397}
21398
21399// For the real atomic operations, we have ldrex/strex up to 32 bits,
21400// and up to 64 bits on the non-M profiles
// NOTE(review): listing lines 21401, 21402, 21404, 21421, 21422 and 21424 are
// absent from this copy. They presumably hold the signature (declaring `AI`)
// and every `return` statement, so the expansion kind chosen on each path
// cannot be read from this listing -- restore them from upstream.
// Floating-point operations are handled first, by the missing statement that
// follows this condition.
21403 if (AI->isFloatingPointOperation())
21405
21406 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
// Support as computed here: v8-M Baseline on M-class, v7+ in Thumb mode,
// v6+ in ARM mode.
21407 bool hasAtomicRMW;
21408 if (Subtarget->isMClass())
21409 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21410 else if (Subtarget->isThumb())
21411 hasAtomicRMW = Subtarget->hasV7Ops();
21412 else
21413 hasAtomicRMW = Subtarget->hasV6Ops();
// Width limit: 32 bits on M-class, 64 bits otherwise.
21414 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21415 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21416 // implement atomicrmw without spilling. If the target address is also on
21417 // the stack and close enough to the spill slot, this can lead to a
21418 // situation where the monitor always gets cleared and the atomic operation
21419 // can never succeed. So at -O0 lower this operation to a CAS loop.
21420 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21423 }
21425}
21426
21427// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21428// bits, and up to 64 bits on the non-M profiles.
// NOTE(review): listing lines 21429, 21430, 21446 and 21447 are absent from
// this copy. They presumably hold the signature (declaring `AI`) and both
// `return` statements, so the expansion kind chosen on each path cannot be
// read from this listing -- restore them from upstream.
21431 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21432 // implement cmpxchg without spilling. If the address being exchanged is also
21433 // on the stack and close enough to the spill slot, this can lead to a
21434 // situation where the monitor always gets cleared and the atomic operation
21435 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
// The width is taken from operand 1 of the instruction.
21436 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
// Support as computed here: v8-M Baseline on M-class, v7+ in Thumb mode,
// v6+ in ARM mode.
21437 bool HasAtomicCmpXchg;
21438 if (Subtarget->isMClass())
21439 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21440 else if (Subtarget->isThumb())
21441 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21442 else
21443 HasAtomicCmpXchg = Subtarget->hasV6Ops();
// All three must hold: optimisation enabled, the subtarget supports it, and
// the width is within 32 bits (M-class) or 64 bits (otherwise).
21444 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21445 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21448}
21449
// Returns the value of `InsertFencesForAtomic`; the instruction `I` is not
// inspected.
// NOTE(review): listing line 21450 is absent from this copy; it presumably
// holds the return type and function name -- restore it from upstream.
21451 const Instruction *I) const {
21452 return InsertFencesForAtomic;
21453}
21454
// Returns true unless the subtarget uses ROPI or RWPI.
// NOTE(review): listing line 21455 is absent from this copy; it presumably
// holds the function signature -- restore it from upstream.
21456 // ROPI/RWPI are not supported currently.
21457 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21458}
21459
// For the Windows MSVC environment, declares in module `M` the
// `__security_cookie` global and the `__security_check_cookie` function, and
// marks that function's first parameter InReg.
// NOTE(review): listing lines 21460 and 21462 are absent from this copy. They
// presumably hold the signature (declaring `M`) and the statement executed
// for non-MSVC environments -- restore them from upstream.
21461 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
21463
21464 // MSVC CRT has a global variable holding security cookie.
21465 M.getOrInsertGlobal("__security_cookie",
21466 PointerType::getUnqual(M.getContext()));
21467
21468 // MSVC CRT has a function to validate security cookie.
21469 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
21470 "__security_check_cookie", Type::getVoidTy(M.getContext()),
21471 PointerType::getUnqual(M.getContext()));
// The attribute is added only if the callee is a Function; dyn_cast may
// return null.
21472 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21473 F->addParamAttr(0, Attribute::AttrKind::InReg);
21474}
21475
// For the Windows MSVC environment, returns module `M`'s `__security_cookie`
// global variable.
// NOTE(review): listing lines 21476 and 21480 are absent from this copy. They
// presumably hold the signature (declaring `M`) and the return used for
// non-MSVC environments -- restore them from upstream.
21477 // MSVC CRT has a global variable holding security cookie.
21478 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
21479 return M.getGlobalVariable("__security_cookie");
21481}
21482
// For the Windows MSVC environment, returns module `M`'s
// `__security_check_cookie` function.
// NOTE(review): listing lines 21483 and 21487 are absent from this copy. They
// presumably hold the signature (declaring `M`) and the return used for
// non-MSVC environments -- restore them from upstream.
21484 // MSVC CRT has a function to validate security cookie.
21485 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
21486 return M.getFunction("__security_check_cookie");
21488}
21489
// Decides whether a store of an element extracted from a vector of type
// `VectorTy` at index `Idx` can be combined into one operation. On success
// `Cost` is set to 0 and true is returned; on every false path `Cost` is left
// untouched.
// NOTE(review): listing line 21490 is absent from this copy; it presumably
// holds the start of the signature declaring `VectorTy` and `Idx`.
21491 unsigned &Cost) const {
21492 // If we do not have NEON, vector types are not natively supported.
21493 if (!Subtarget->hasNEON())
21494 return false;
21495
21496 // Floating point values and vector values map to the same register file.
21497 // Therefore, although we could do a store extract of a vector type, this is
21498 // better to leave at float as we have more freedom in the addressing mode for
21499 // those.
21500 if (VectorTy->isFPOrFPVectorTy())
21501 return false;
21502
21503 // If the index is unknown at compile time, this is very expensive to lower
21504 // and it is not possible to combine the store with the extract.
21505 if (!isa<ConstantInt>(Idx))
21506 return false;
21507
21508 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21509 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21510 // We can do a store + vector extract on any vector that fits perfectly in a D
21511 // or Q register.
21512 if (BitWidth == 64 || BitWidth == 128) {
21513 Cost = 0;
21514 return true;
21515 }
21516 return false;
21517}
21518
// Predicate that is true exactly when the subtarget has the V6T2 operations.
// NOTE(review): the signature line (listing number 21519) is absent from this
// listing, so the hook being implemented cannot be confirmed from here.
21520 return Subtarget->hasV6T2Ops();
21521}
21522
// Predicate that is true exactly when the subtarget has the V6T2 operations.
// NOTE(review): the signature line (listing number 21523) is absent from this
// listing, so the hook being implemented cannot be confirmed from here.
21524 return Subtarget->hasV6T2Ops();
21525}
21526
// Reports whether sinking the `and` instruction AndI is worthwhile: requires
// V7 ops and a constant mask of at most 32 bits that is encodable as a
// modified immediate (Thumb2 encoding when in Thumb2 mode, ARM encoding
// otherwise).
// NOTE(review): the first line of the signature (listing number 21527) and
// the declaration of `Mask` (listing number 21534) are absent from this
// listing; Mask is presumably a ConstantInt taken from an operand of AndI —
// confirm against the upstream file.
21528 const Instruction &AndI) const {
21529 if (!Subtarget->hasV7Ops())
21530 return false;
21531
21532 // Sink the `and` instruction only if the mask would fit into a modified
21533 // immediate operand.
21535 if (!Mask || Mask->getValue().getBitWidth() > 32u)
21536 return false;
21537 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
// The encoders return -1 when the value has no modified-immediate encoding.
21538 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21539 : ARM_AM::getSOImmVal(MaskVal)) != -1;
21540}
21541
// Chooses a strategy for legalizing a shift node N with the given
// ExpansionFactor. The only visible condition is a min-size subtarget that is
// not targeting Windows.
// NOTE(review): the start of the signature (listing numbers 21542-21543) and
// the two return statements (listing numbers 21546-21547) are absent from this
// listing; what is returned on each path must be confirmed against the
// upstream file.
21544 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21545 if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
21548 ExpansionFactor);
21549}
21550
// Emits, through Builder, a call to an ARM exclusive-load intrinsic reading a
// value of type ValueTy from Addr, and returns the loaded value. The acquire
// variants (ldaex/ldaexd) are chosen when Ord is acquire or stronger.
//  - 64-bit ValueTy: the doubleword intrinsic's two results are extracted,
//    swapped on big-endian subtargets, zero-extended and recombined as
//    lo | (hi << 32).
//  - otherwise: the intrinsic is overloaded on Addr's type, the call's first
//    parameter is tagged with an ElementType attribute of ValueTy, and the
//    result is truncated or bitcast to ValueTy.
// NOTE(review): the first line of the signature (listing number 21551) and, in
// the 64-bit branch, the start of the intrinsic-ID declaration and the
// definition of `Ldrex` (listing numbers 21561 and 21563) are absent from this
// listing.
21552 Value *Addr,
21553 AtomicOrdering Ord) const {
21554 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21555 bool IsAcquire = isAcquireOrStronger(Ord);
21556
21557 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21558 // intrinsic must return {i32, i32} and we have to recombine them into a
21559 // single i64 here.
21560 if (ValueTy->getPrimitiveSizeInBits() == 64) {
21562 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21564
21565 Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
21566
21567 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21568 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
// On big-endian subtargets the two halves come back in the opposite order.
21569 if (!Subtarget->isLittle())
21570 std::swap (Lo, Hi);
21571 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21572 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21573 return Builder.CreateOr(
21574 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21575 }
21576
21577 Type *Tys[] = { Addr->getType() };
21578 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21579 Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);
21580 CallInst *CI = Builder.CreateCall(Ldrex, Addr);
21581
// Record the accessed element type on the pointer argument.
21582 CI->addParamAttr(
21583 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21584 return Builder.CreateTruncOrBitCast(CI, ValueTy);
21585}
21586
// Emits a call to the `arm_clrex` intrinsic at Builder's insertion point.
// Nothing is emitted when the subtarget lacks V7 ops.
// NOTE(review): the first line of the signature (listing number 21587) is
// absent from this listing.
21588 IRBuilderBase &Builder) const {
21589 if (!Subtarget->hasV7Ops())
21590 return;
21591 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21592 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
21593}
21594
// Emits, through Builder, a call to an ARM exclusive-store intrinsic writing
// Val to Addr, and returns that call. The release variants (stlex/stlexd) are
// chosen when Ord is release or stronger.
//  - 64-bit Val: the value is split into two i32 halves (swapped on big-endian
//    subtargets) and passed as {Lo, Hi, Addr}.
//  - otherwise: the intrinsic is overloaded on Addr's type, Val is
//    zero-extended or bitcast to the intrinsic's first parameter type, and the
//    call's pointer parameter is tagged with an ElementType attribute of
//    Val's type.
// NOTE(review): the first line of the signature (listing number 21595) and, in
// the 64-bit branch, the start of the intrinsic-ID declaration and the
// definition of `Strex` (listing numbers 21605 and 21607) are absent from this
// listing.
21596 Value *Val, Value *Addr,
21597 AtomicOrdering Ord) const {
21598 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21599 bool IsRelease = isReleaseOrStronger(Ord);
21600
21601 // Since the intrinsics must have legal type, the i64 intrinsics take two
21602 // parameters: "i32, i32". We must marshal Val into the appropriate form
21603 // before the call.
21604 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21606 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21608 Type *Int32Ty = Type::getInt32Ty(M->getContext());
21609
21610 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21611 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
// Big-endian subtargets pass the two halves in the opposite order.
21612 if (!Subtarget->isLittle())
21613 std::swap(Lo, Hi);
21614 return Builder.CreateCall(Strex, {Lo, Hi, Addr});
21615 }
21616
21617 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21618 Type *Tys[] = { Addr->getType() };
21619 Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
21620
21621 CallInst *CI = Builder.CreateCall(
21622 Strex, {Builder.CreateZExtOrBitCast(
21623 Val, Strex->getFunctionType()->getParamType(0)),
21624 Addr});
// Record the stored element type on the pointer argument (parameter 1).
21625 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21626 Val->getType()));
21627 return CI;
21628}
21629
21630
// Predicate that is true exactly when the subtarget is an M-class core.
// NOTE(review): the signature line (listing number 21631) is absent from this
// listing, so the hook being implemented cannot be confirmed from here.
21632 return Subtarget->isMClass();
21633}
21634
21635/// A helper function for determining the number of interleaved accesses we
21636/// will generate when lowering accesses of the given type. This is the size of
/// \p VecTy in bits (per \p DL) divided by 128, rounded up.
// NOTE(review): the line carrying the qualified function name and first
// parameter (listing number 21638) is absent from this listing.
21637unsigned
21639 const DataLayout &DL) const {
21640 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21641}
21642
// Reports whether an interleaved access of sub-vector type VecTy with the
// given interleave Factor and Alignment can be lowered on this subtarget.
// Requirements checked, in order: NEON or MVE integer ops are available; no
// half-precision elements under NEON; Factor is not 3 under MVE; at least two
// elements; element size of 8, 16 or 32 bits; under MVE, alignment of at least
// the element size in bytes; and a total size of 64 bits (NEON only) or any
// multiple of 128 bits.
// NOTE(review): the first line of the signature (listing number 21643) is
// absent from this listing.
21644 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21645 const DataLayout &DL) const {
21646
21647 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21648 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21649
21650 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21651 return false;
21652
21653 // Ensure the vector doesn't have f16 elements. Even though we could do an
21654 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21655 // f32.
21656 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21657 return false;
21658 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21659 return false;
21660
21661 // Ensure the number of vector elements is greater than 1.
21662 if (VecTy->getNumElements() < 2)
21663 return false;
21664
21665 // Ensure the element type is legal.
21666 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21667 return false;
21668 // And that the alignment is high enough under MVE.
21669 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21670 return false;
21671
21672 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21673 // 128 will be split into multiple interleaved accesses.
21674 if (Subtarget->hasNEON() && VecSize == 64)
21675 return true;
21676 return VecSize % 128 == 0;
21677}
21678
// Returns the largest interleave factor the subtarget supports; 4 when NEON is
// available.
// NOTE(review): the signature line (listing number 21679) and the return
// statements for the MVE and fallback paths (listing numbers 21683-21684) are
// absent from this listing — confirm those values against the upstream file.
21680 if (Subtarget->hasNEON())
21681 return 4;
21682 if (Subtarget->hasMVEIntegerOps())
21685}
21686
21687/// Lower an interleaved load into a vldN intrinsic.
21688///
21689/// E.g. Lower an interleaved load (Factor = 2):
21690/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21691/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21692/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21693///
21694/// Into:
21695/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21696/// %vec0 = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
21697/// %vec1 = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
///
/// Returns false, without emitting anything, when the access type is not
/// legal per isLegalInterleavedAccessType; otherwise emits the intrinsic
/// call(s), replaces all uses of every shuffle in \p Shuffles, and returns
/// true. The shuffles and the original load are not erased here.
// NOTE(review): the first lines of the signature (listing numbers
// 21698-21699) and the declarations of `Ops` (21750, 21765), `SubVecs` (21774)
// and the IntToPtr destination type (21797) are absent from this listing.
21700 ArrayRef<unsigned> Indices, unsigned Factor) const {
21701 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21702 "Invalid interleave factor");
21703 assert(!Shuffles.empty() && "Empty shufflevector input");
21704 assert(Shuffles.size() == Indices.size() &&
21705 "Unmatched number of shufflevectors and indices");
21706
21707 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21708 Type *EltTy = VecTy->getElementType();
21709
21710 const DataLayout &DL = LI->getDataLayout();
21711 Align Alignment = LI->getAlign();
21712
21713 // Skip if we do not have NEON and skip illegal vector types. We can
21714 // "legalize" wide vector types into multiple interleaved accesses as long as
21715 // the vector types are divisible by 128.
21716 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21717 return false;
21718
21719 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21720
21721 // A pointer vector can not be the return type of the ldN intrinsics. Need to
21722 // load integer vectors first and then convert to pointer vectors.
21723 if (EltTy->isPointerTy())
21724 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21725
21726 IRBuilder<> Builder(LI);
21727
21728 // The base address of the load.
21729 Value *BaseAddr = LI->getPointerOperand();
21730
21731 if (NumLoads > 1) {
21732 // If we're going to generate more than one load, reset the sub-vector type
21733 // to something legal.
21734 VecTy = FixedVectorType::get(VecTy->getElementType(),
21735 VecTy->getNumElements() / NumLoads);
21736 }
21737
21738 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21739
// Builds one vldN call at the given address: the NEON intrinsics take the
// address plus the load's alignment as an i32, the MVE ones only the address.
21740 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21741 if (Subtarget->hasNEON()) {
21742 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21743 Type *Tys[] = {VecTy, PtrTy};
// Indexed by Factor - 2, so Factor 2, 3, 4 select vld2, vld3, vld4.
21744 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21745 Intrinsic::arm_neon_vld3,
21746 Intrinsic::arm_neon_vld4};
21747 Function *VldnFunc =
21748 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
21749
21751 Ops.push_back(BaseAddr);
21752 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21753
21754 return Builder.CreateCall(VldnFunc, Ops, "vldN");
21755 } else {
21756 assert((Factor == 2 || Factor == 4) &&
21757 "expected interleave factor of 2 or 4 for MVE");
21758 Intrinsic::ID LoadInts =
21759 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21760 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21761 Type *Tys[] = {VecTy, PtrTy};
21762 Function *VldnFunc =
21763 Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);
21764
21766 Ops.push_back(BaseAddr);
21767 return Builder.CreateCall(VldnFunc, Ops, "vldN");
21768 }
21769 };
21770
21771 // Holds sub-vectors extracted from the load intrinsic return values. The
21772 // sub-vectors are associated with the shufflevector instructions they will
21773 // replace.
21775
21776 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21777 // If we're generating more than one load, compute the base address of
21778 // subsequent loads as an offset from the previous.
21779 if (LoadCount > 0)
21780 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21781 VecTy->getNumElements() * Factor);
21782
21783 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21784
21785 // Replace uses of each shufflevector with the corresponding vector loaded
21786 // by ldN.
21787 for (unsigned i = 0; i < Shuffles.size(); i++) {
21788 ShuffleVectorInst *SV = Shuffles[i];
21789 unsigned Index = Indices[i];
21790
21791 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21792
21793 // Convert the integer vector to pointer vector if the element is pointer.
21794 if (EltTy->isPointerTy())
21795 SubVec = Builder.CreateIntToPtr(
21796 SubVec,
21798
21799 SubVecs[SV].push_back(SubVec);
21800 }
21801 }
21802
21803 // Replace uses of the shufflevector instructions with the sub-vectors
21804 // returned by the load intrinsic. If a shufflevector instruction is
21805 // associated with more than one sub-vector, those sub-vectors will be
21806 // concatenated into a single wide vector.
21807 for (ShuffleVectorInst *SVI : Shuffles) {
21808 auto &SubVec = SubVecs[SVI];
21809 auto *WideVec =
21810 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21811 SVI->replaceAllUsesWith(WideVec);
21812 }
21813
21814 return true;
21815}
21816
21817/// Lower an interleaved store into a vstN intrinsic.
21818///
21819/// E.g. Lower an interleaved store (Factor = 3):
21820/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21821/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21822/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21823///
21824/// Into:
21825/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21826/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21827/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21828/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21829///
21830/// Note that the new shufflevectors will be removed and we'll only generate one
21831/// vst3 instruction in CodeGen.
21832///
21833/// Example for a more general valid mask (Factor 3). Lower:
21834/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21835/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21836/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21837///
21838/// Into:
21839/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21840/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21841/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21842/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Returns false, without emitting anything, when the per-lane sub-vector type
/// is not legal per isLegalInterleavedAccessType; otherwise emits the
/// intrinsic call(s) before \p SI and returns true. The original store and
/// shuffle are not erased here.
// NOTE(review): the first line of the signature (listing number 21843) and
// the declarations of `IntVecTy`'s initializer (21878), `VstNFunc` (21908) and
// `Ops` (21911, 21926) are absent from this listing.
21844 ShuffleVectorInst *SVI,
21845 unsigned Factor) const {
21846 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21847 "Invalid interleave factor");
21848
21849 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21850 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21851
21852 unsigned LaneLen = VecTy->getNumElements() / Factor;
21853 Type *EltTy = VecTy->getElementType();
21854 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21855
21856 const DataLayout &DL = SI->getDataLayout();
21857 Align Alignment = SI->getAlign();
21858
21859 // Skip if we do not have NEON and skip illegal vector types. We can
21860 // "legalize" wide vector types into multiple interleaved accesses as long as
21861 // the vector types are divisible by 128.
21862 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21863 return false;
21864
21865 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21866
21867 Value *Op0 = SVI->getOperand(0);
21868 Value *Op1 = SVI->getOperand(1);
21869 IRBuilder<> Builder(SI);
21870
21871 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21872 // vectors to integer vectors.
21873 if (EltTy->isPointerTy()) {
21874 Type *IntTy = DL.getIntPtrType(EltTy);
21875
21876 // Convert to the corresponding integer vector.
21877 auto *IntVecTy =
21879 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
21880 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
21881
21882 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
21883 }
21884
21885 // The base address of the store.
21886 Value *BaseAddr = SI->getPointerOperand();
21887
21888 if (NumStores > 1) {
21889 // If we're going to generate more than one store, reset the lane length
21890 // and sub-vector type to something legal.
21891 LaneLen /= NumStores;
21892 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
21893 }
21894
21895 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21896
21897 auto Mask = SVI->getShuffleMask();
21898
// Builds the vstN call(s) for one group of Factor sub-vectors. NEON takes the
// address, the sub-vectors and the store's alignment in a single call; MVE
// issues Factor calls, each with the sub-vectors plus a stage index 0..Factor-1.
21899 auto createStoreIntrinsic = [&](Value *BaseAddr,
21900 SmallVectorImpl<Value *> &Shuffles) {
21901 if (Subtarget->hasNEON()) {
// Indexed by Factor - 2, so Factor 2, 3, 4 select vst2, vst3, vst4.
21902 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21903 Intrinsic::arm_neon_vst3,
21904 Intrinsic::arm_neon_vst4};
21905 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21906 Type *Tys[] = {PtrTy, SubVecTy};
21907
21909 SI->getModule(), StoreInts[Factor - 2], Tys);
21910
21912 Ops.push_back(BaseAddr);
21913 append_range(Ops, Shuffles);
21914 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
21915 Builder.CreateCall(VstNFunc, Ops);
21916 } else {
21917 assert((Factor == 2 || Factor == 4) &&
21918 "expected interleave factor of 2 or 4 for MVE");
21919 Intrinsic::ID StoreInts =
21920 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21921 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21922 Type *Tys[] = {PtrTy, SubVecTy};
21923 Function *VstNFunc =
21924 Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);
21925
21927 Ops.push_back(BaseAddr);
21928 append_range(Ops, Shuffles);
21929 for (unsigned F = 0; F < Factor; F++) {
21930 Ops.push_back(Builder.getInt32(F));
21931 Builder.CreateCall(VstNFunc, Ops);
21932 Ops.pop_back();
21933 }
21934 }
21935 };
21936
21937 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21938 // If we're generating more than one store, we compute the base address of
21939 // subsequent stores as an offset from the previous.
21940 if (StoreCount > 0)
21941 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
21942 BaseAddr, LaneLen * Factor);
21943
21944 SmallVector<Value *, 4> Shuffles;
21945
21946 // Split the shufflevector operands into sub vectors for the new vstN call.
21947 for (unsigned i = 0; i < Factor; i++) {
// Mask slot holding element 0 of sub-vector i for this store.
21948 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21949 if (Mask[IdxI] >= 0) {
21950 Shuffles.push_back(Builder.CreateShuffleVector(
21951 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
21952 } else {
21953 unsigned StartMask = 0;
// Element j of sub-vector i for this store lives Factor slots per element
// after IdxI. The first defined one gives the chunk's start as (value - j).
// The index must stay inside this store's slice of the mask, so the
// StoreCount offset is applied once and j (not the absolute slot) is
// subtracted.
21954 for (unsigned j = 1; j < LaneLen; j++) {
21955 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
21956 if (Mask[IdxJ] >= 0) {
21957 StartMask = Mask[IdxJ] - j;
21958 break;
21959 }
21960 }
21961 // Note: If all elements in a chunk are undefs, StartMask=0!
21962 // Note: Filling undef gaps with random elements is ok, since
21963 // those elements were being written anyway (with undefs).
21964 // In the case of all undefs we're defaulting to using elems from 0
21965 // Note: StartMask cannot be negative, it's checked in
21966 // isReInterleaveMask
21967 Shuffles.push_back(Builder.CreateShuffleVector(
21968 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
21969 }
21970 }
21971
21972 createStoreIntrinsic(BaseAddr, Shuffles);
21973 }
21974 return true;
21975}
21976
21984
// Recursively classifies Ty as a homogeneous aggregate. Base carries the
// element kind seen so far (HA_UNKNOWN until the first leaf is reached) and
// Members receives the number of leaf elements.
//  - struct: every element must classify successfully; their counts are added.
//  - array: the element count is multiplied by the array length.
//  - float / double: Base must be unknown or already the same kind.
//  - vector: must be 64 or 128 bits wide and match any earlier vector width;
//    vectors never mix with float or double bases.
// For struct, array, float and double the final result additionally requires
// 1 to 4 members; the vector case returns directly from its switch. Any other
// type falls through with Members unchanged by this call.
// NOTE(review): the first line of the signature (listing number 21985) and
// the definition of the HA_* enumerators (listing numbers 21977-21983) are
// absent from this listing.
21986 uint64_t &Members) {
21987 if (auto *ST = dyn_cast<StructType>(Ty)) {
21988 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21989 uint64_t SubMembers = 0;
21990 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
21991 return false;
21992 Members += SubMembers;
21993 }
21994 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
21995 uint64_t SubMembers = 0;
21996 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
21997 return false;
21998 Members += SubMembers * AT->getNumElements();
21999 } else if (Ty->isFloatTy()) {
22000 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
22001 return false;
22002 Members = 1;
22003 Base = HA_FLOAT;
22004 } else if (Ty->isDoubleTy()) {
22005 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
22006 return false;
22007 Members = 1;
22008 Base = HA_DOUBLE;
22009 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
22010 Members = 1;
22011 switch (Base) {
22012 case HA_FLOAT:
22013 case HA_DOUBLE:
22014 return false;
22015 case HA_VECT64:
22016 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
22017 case HA_VECT128:
22018 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
22019 case HA_UNKNOWN:
// First leaf seen: the vector's width fixes the base kind.
22020 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
22021 case 64:
22022 Base = HA_VECT64;
22023 return true;
22024 case 128:
22025 Base = HA_VECT128;
22026 return true;
22027 default:
22028 return false;
22029 }
22030 }
22031 }
22032
22033 return (Members > 0 && Members <= 4);
22034}
22035
22036/// Return the correct alignment for the current calling convention.
/// Non-vector types use the data layout's ABI alignment unchanged; vector
/// types use the smaller of the ABI alignment and the stack alignment.
// NOTE(review): the first line of the signature (listing number 22037) is
// absent from this listing.
22038 Type *ArgTy, const DataLayout &DL) const {
22039 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
22040 if (!ArgTy->isVectorTy())
22041 return ABITypeAlign;
22042
22043 // Avoid over-aligning vector parameters. It would require realigning the
22044 // stack and waste space for no real benefit.
22045 MaybeAlign StackAlign = DL.getStackAlignment();
22046 assert(StackAlign && "data layout string is missing stack alignment");
22047 return std::min(ABITypeAlign, *StackAlign);
22048}
22049
22050/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
22051/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
22052/// passing according to AAPCS rules.
// The array test below accepts any array whose element type is an integer; it
// does not itself check the element width.
// NOTE(review): the first line of the signature (listing number 22053), the
// calling convention compared against (22057) and the declaration of `Base`
// (22060) are absent from this listing.
22054 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
22055 const DataLayout &DL) const {
22056 if (getEffectiveCallingConv(CallConv, isVarArg) !=
22058 return false;
22059
22061 uint64_t Members = 0;
22062 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
22063 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
22064
22065 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
22066 return IsHA || IsIntArray;
22067}
22068
// Returns ARM::R0, or a default-constructed Register when the subtarget uses
// SjLj exception handling. PersonalityFn is not consulted.
// NOTE(review): the first line of the signature (listing number 22069) is
// absent from this listing.
22070 const Constant *PersonalityFn) const {
22071 // Platforms which do not use SjLj EH may return values in these registers
22072 // via the personality function.
22073 return Subtarget->useSjLjEH() ? Register() : ARM::R0;
22074}
22075
// Returns ARM::R1, or a default-constructed Register when the subtarget uses
// SjLj exception handling. PersonalityFn is not consulted.
// NOTE(review): the first line of the signature (listing number 22076) is
// absent from this listing.
22077 const Constant *PersonalityFn) const {
22078 // Platforms which do not use SjLj EH may return values in these registers
22079 // via the personality function.
22080 return Subtarget->useSjLjEH() ? Register() : ARM::R1;
22081}
22082
22083void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
22084 // Mark the function that owns Entry as using split callee-saved registers.
22085 auto *OwningFn = Entry->getParent();
22086 OwningFn->getInfo<ARMFunctionInfo>()->setIsSplitCSR(true);
22087}
22088
// For every register in the null-terminated list returned by
// getCalleeSavedRegsViaCopy, copies the register into a fresh virtual register
// in Entry and copies it back immediately before the first terminator of each
// block in Exits. Does nothing when that list is null. Only GPR and DPR
// registers are expected; anything else hits llvm_unreachable.
// NOTE(review): the declaration of `MBBI`, the insertion point used in Entry
// (listing number 22099), is absent from this listing.
22089void ARMTargetLowering::insertCopiesSplitCSR(
22090 MachineBasicBlock *Entry,
22091 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
22092 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
22093 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
22094 if (!IStart)
22095 return;
22096
22097 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
22098 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
// The list ends at the first zero entry.
22100 for (const MCPhysReg *I = IStart; *I; ++I) {
22101 const TargetRegisterClass *RC = nullptr;
22102 if (ARM::GPRRegClass.contains(*I))
22103 RC = &ARM::GPRRegClass;
22104 else if (ARM::DPRRegClass.contains(*I))
22105 RC = &ARM::DPRRegClass;
22106 else
22107 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
22108
22109 Register NewVR = MRI->createVirtualRegister(RC);
22110 // Create copy from CSR to a virtual register.
22111 // FIXME: this currently does not emit CFI pseudo-instructions, it works
22112 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
22113 // nounwind. If we want to generalize this later, we may need to emit
22114 // CFI pseudo-instructions.
22115 assert(Entry->getParent()->getFunction().hasFnAttribute(
22116 Attribute::NoUnwind) &&
22117 "Function should be nounwind in insertCopiesSplitCSR!");
22118 Entry->addLiveIn(*I);
22119 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
22120 .addReg(*I);
22121
22122 // Insert the copy-back instructions right before the terminator.
22123 for (auto *Exit : Exits)
22124 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
22125 TII->get(TargetOpcode::COPY), *I)
22126 .addReg(NewVR);
22127 }
22128}
22129
22134
// Predicate that is true exactly when the subtarget has MVE integer ops.
// NOTE(review): the signature line (listing number 22135) is absent from this
// listing, so the hook being implemented cannot be confirmed from here.
22136 return Subtarget->hasMVEIntegerOps();
22137}
22138
// Reports whether a complex-deinterleaving operation is supported on type Ty.
// Ty must be a fixed vector whose total width is a power of two of at least
// 128 bits. Half and float element types require MVE float ops; on the final
// path, 8-, 16- and 32-bit integer element types require MVE integer ops.
// NOTE(review): the signature (listing numbers 22139-22140) and the condition
// guarding the `return false` at 22157 (listing number 22156) are absent from
// this listing; that condition presumably restricts which operations reach the
// integer path — confirm against the upstream file.
22141 auto *VTy = dyn_cast<FixedVectorType>(Ty);
22142 if (!VTy)
22143 return false;
22144
22145 auto *ScalarTy = VTy->getScalarType();
22146 unsigned NumElements = VTy->getNumElements();
22147
22148 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
22149 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
22150 return false;
22151
22152 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
22153 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
22154 return Subtarget->hasMVEFloatOps();
22155
22157 return false;
22158
22159 return Subtarget->hasMVEIntegerOps() &&
22160 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
22161 ScalarTy->isIntegerTy(32));
22162}
22163
// Emits, through builder B, the MVE intrinsic call implementing OperationType
// with the given Rotation on InputA/InputB (and Accumulator, when non-null),
// and returns the resulting value.
//  - Vectors wider than 128 bits are split into lower and upper halves,
//    lowered recursively, and re-joined with a shuffle.
//  - CMulPartial: vcmlaq when an accumulator is given, vcmulq otherwise; the
//    rotation is passed as an i32 constant and the inputs are passed as
//    (InputB, InputA).
//  - CAdd: vcaddq with the non-halving flag; returns nullptr when no rotation
//    constant was selected.
//  - Any other operation: returns nullptr.
// NOTE(review): the start of the signature (listing numbers 22164-22165), the
// declaration of `Ty` (22169) and the two rotation conditions in the CAdd
// branch (22220, 22222) are absent from this listing. As listed, a null
// result from a recursive call would be passed straight to
// CreateShuffleVector — confirm whether callers rule that out.
22166 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
22167 Value *Accumulator) const {
22168
22170
22171 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
22172
22173 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
22174
22175 if (TyWidth > 128) {
// SplitSeqVec is 0..N-1; its first and second halves are the identity masks
// selecting the lower and upper half of each N-element input.
22176 int Stride = Ty->getNumElements() / 2;
22177 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
22178 auto SplitSeqVec = llvm::to_vector(SplitSeq);
22179 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
22180 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
22181
22182 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
22183 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
22184 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
22185 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
22186 Value *LowerSplitAcc = nullptr;
22187 Value *UpperSplitAcc = nullptr;
22188
22189 if (Accumulator) {
22190 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
22191 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
22192 }
22193
22194 auto *LowerSplitInt = createComplexDeinterleavingIR(
22195 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
22196 auto *UpperSplitInt = createComplexDeinterleavingIR(
22197 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
22198
// Concatenate the two half-width results back into one N-element vector.
22199 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
22200 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
22201 }
22202
22203 auto *IntTy = Type::getInt32Ty(B.getContext());
22204
22205 ConstantInt *ConstRotation = nullptr;
22206 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
22207 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
22208
22209 if (Accumulator)
22210 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
22211 {ConstRotation, Accumulator, InputB, InputA});
22212 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
22213 {ConstRotation, InputB, InputA});
22214 }
22215
22216 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
22217 // 1 means the value is not halved.
22218 auto *ConstHalving = ConstantInt::get(IntTy, 1);
22219
22221 ConstRotation = ConstantInt::get(IntTy, 0);
22223 ConstRotation = ConstantInt::get(IntTy, 1);
22224
22225 if (!ConstRotation)
22226 return nullptr; // Invalid rotation for arm_mve_vcaddq
22227
22228 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
22229 {ConstHalving, ConstRotation, InputA, InputB});
22230 }
22231
22232 return nullptr;
22233}
unsigned const MachineRegisterInfo * MRI
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static EVT getExtensionTo64Bits(const EVT &OrigVT)
return SDValue()
static const MCPhysReg GPRArgRegs[]
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
#define MAKE_CASE(V)
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static bool isConstant(const MachineInstr &MI)
static const LLT S1
static const LLT F64
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG)
static bool isStore(int Opcode)
static bool isThumb(const MCSubtargetInfo &STI)
static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, const TargetInstrInfo *TII)
MatchingStackOffset - Return true if the given stack call argument is already available in the same p...
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total value size to 64 bits.
static cl::opt< unsigned > ConstpoolPromotionMaxSize("arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64))
static bool isZeroOrAllOnes(SDValue N, bool AllOnes)
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isVTBLMask(ArrayRef< int > M, EVT VT)
static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
static cl::opt< bool > EnableConstpoolPromotion("arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false))
static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG)
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformExtractEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static const APInt * isPowerOf2Constant(SDValue V)
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) can replace combinations of ...
static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static bool isValidMVECond(unsigned CC, bool IsFloat)
static SDValue PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC)
IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSTORECombine - Target-specific dag combine xforms for ISD::STORE.
static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, SelectionDAG &DAG)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isGTorGE(ISD::CondCode CC)
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) intrinsic,...
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask)
static bool isReverseMask(ArrayRef< int > M, EVT VT)
static bool isVZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of "vector_shuffle v,...
static SDValue PerformSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD) can replace combinations...
static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0, SDValue V1)
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG)
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc)
static bool isVTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool CanInvertMVEVCMP(SDValue N)
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG)
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
PerformShiftCombine - Checks for immediate versions of vector shifts and lowers them.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2)
FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static EVT getVectorTyFromPredicateVector(EVT VT)
static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG)
static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg, SelectionDAG &DAG, const SDLoc &DL)
static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static bool isSRL16(const SDValue &Op)
static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC)
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, SDValue Inc, const SelectionDAG &DAG)
static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static Register genTPEntry(MachineBasicBlock *TpEntry, MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpExit, Register OpSizeReg, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI)
Adds logic in loop entry MBB to calculate loop iteration count and adds t2WhileLoopSetup and t2WhileL...
static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V)
static bool isLTorLE(ISD::CondCode CC)
static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG)
static SDValue PerformBITCASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG)
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG)
static bool hasNormalLoadOperand(SDNode *N)
hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node are normal,...
static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
PerformInsertEltCombine - Target-specific dag combine xforms for ISD::INSERT_VECTOR_ELT.
static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVDUPLANECombine - Target-specific dag combine xforms for ARMISD::VDUPLANE.
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static cl::opt< unsigned > ConstpoolPromotionMaxTotal("arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128))
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static RTLIB::Libcall getDivRemLibcall(const SDNode *N, MVT::SimpleValueType SVT)
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG)
SkipLoadExtensionForVMULL - return a load of the original vector size that does not do any sign/zero ...
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static const MCPhysReg GPRArgRegs[]
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG)
static bool isVZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isVTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of "vector_shuffle v,...
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FindBFIToCombineWith(SDNode *N)
static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, SelectionDAG &DAG)
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps)
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isS16(const SDValue &Op, SelectionDAG &DAG)
static bool isSRA16(const SDValue &Op)
static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue LowerInterruptReturn(SmallVectorImpl< SDValue > &RetOps, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG)
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2)
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isSHL16(const SDValue &Op)
static bool isVEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseVEXT, unsigned &Imm)
static SDValue PerformMVEVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isTruncMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2)
Return the load opcode for a given load size.
static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
static bool isLegalMVEShuffleOp(unsigned PFEntry)
static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static bool isVUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG)
PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for ISD::VECTOR_SHUFFLE.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG)
SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, ANY_EXTEND,...
static bool isVMOVNTruncMask(ArrayRef< int > M, EVT ToVT, bool rev)
static SDValue PerformVQMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static TargetLowering::ArgListTy getDivRemArgList(const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget)
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static ARMCC::CondCodes getVCMPCondCode(SDValue N)
static cl::opt< bool > ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true))
static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVSetCCToVCTPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isZeroVector(SDValue N)
static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K)
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG)
static bool isValidBaseUpdate(SDNode *N, SDNode *User)
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op)
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, struct BaseUpdateUser &User, bool SimpleConstIncOnly, TargetLowering::DAGCombinerInfo &DCI)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
This file defines a TargetTransformInfo::Concept conforming object specific to the ARM target machine.
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
This file implements the BitVector class.
BlockVerifier::State From
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static std::optional< bool > isBigEndian(const SmallDenseMap< int64_t, int64_t, 8 > &MemOffset2Idx, int64_t LowestIdx)
Given a map from byte offsets in memory to indices in a load/store, determine if that map corresponds...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
#define LLVM_DEBUG(X)
Definition Debug.h:101
This file defines the DenseMap class.
uint64_t Align
uint64_t Addr
std::string Name
uint64_t Size
Symbol * Sym
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static Function * getFunction(Constant *C)
static bool isSigned(unsigned int Opcode)
#define Check(C,...)
#define op(i)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
lazy value info
loop Loop Strength Reduction
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
unsigned const TargetRegisterInfo * TRI
unsigned Reg
Promote Memory to Register
Definition Mem2Reg.cpp:110
nvptx lower args
uint64_t High
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
PowerPC Reduce CR logical Operation
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
const MachineOperand & RHS
return LHS getImm()<
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI Lower i1 Copies
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:469
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:166
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * LHS
bool getExactInverse(APFloat *inv) const
Definition APFloat.h:1399
APInt bitcastToAPInt() const
Definition APFloat.h:1266
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition APFloat.h:1241
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:227
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1513
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1642
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1007
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1485
APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:910
void setBit(unsigned BitPosition)
Set the bit whose position is given as "BitPosition" to 1.
Definition APInt.h:1323
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1194
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:364
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1461
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1104
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1611
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1570
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:624
unsigned logBase2() const
Definition APInt.h:1732
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:468
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1250
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:433
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:299
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:289
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:232
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1535
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:851
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:844
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1628
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1214
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:23
virtual const ARMBaseRegisterInfo & getRegisterInfo() const =0
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
bool isTargetMachO() const
bool isTargetAEABI() const
bool hasARMOps() const
bool supportsTailCall() const
const Triple & getTargetTriple() const
bool hasVFP4Base() const
const ARMBaseInstrInfo * getInstrInfo() const override
bool isThumb1Only() const
bool useFPVFMx() const
bool hasFPARMv8Base() const
bool isThumb2() const
bool isTargetWindows() const
bool isGVIndirectSymbol(const GlobalValue *GV) const
True if the GV will be accessed via an indirect symbol.
bool hasBaseDSP() const
const ARMTargetLowering * getTargetLowering() const override
bool useSjLjEH() const
bool isTargetDarwin() const
const ARMBaseRegisterInfo * getRegisterInfo() const override
bool hasVFP2Base() const
bool isTargetAndroid() const
bool isTargetCOFF() const
bool isTargetGNUAEABI() const
bool hasVFP3Base() const
bool isAPCS_ABI() const
bool useFPVFMx64() const
bool isTargetWatchOS() const
unsigned getPreferBranchLogAlignment() const
bool hasMinSize() const
bool isTargetIOS() const
bool useNEONForSinglePrecisionFP() const
const InstrItineraryData * getInstrItineraryData() const override
getInstrItins - Return the instruction itineraries based on subtarget selection.
bool isTargetWatchABI() const
bool hasAnyDataBarrier() const
bool isTargetDriverKit() const
bool isAAPCS_ABI() const
bool isLittle() const
bool allowsUnalignedMem() const
bool isTargetMuslAEABI() const
bool isTargetLinux() const
bool useFPVFMx16() const
bool isMClass() const
bool isTargetHardFloat() const
bool useMulOps() const
bool isTargetELF() const
Align getDualLoadStoreAlignment() const
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode represented by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type SrcTy to type DstTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a vstN intrinsic.
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy,Idx).
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a vldN intrinsic.
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
This class represents an incoming formal argument to a Function.
Definition Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
Definition ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
bool isFloatingPointOperation() const
This class holds the attributes for a function, its return value, and its parameters.
Definition Attributes.h:468
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
static BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition BasicBlock.h:61
The address of a basic block.
Definition Constants.h:893
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
unsigned getValNo() const
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getCalledOperand() const
AttributeList getAttributes() const
Return the parameter attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:709
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:271
This is the shared class of boolean and integer constants.
Definition Constants.h:83
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition Constant.h:42
This class represents an Operation in the Expression.
A parsed version of the target data layout string, together with methods for querying it.
Definition DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:195
bool isBigEndian() const
Definition DataLayout.h:196
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition DataLayout.h:225
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition DataLayout.h:459
Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
StringRef getPrivateGlobalPrefix() const
Definition DataLayout.h:283
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:33
unsigned size() const
Definition DenseMap.h:99
bool empty() const
Definition DenseMap.h:98
iterator begin()
Definition DenseMap.h:75
Diagnostic information for unsupported feature in backend.
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:689
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:216
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
Definition Function.h:277
const Function & getFunction() const
Definition Function.h:171
arg_iterator arg_begin()
Definition Function.h:865
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:380
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition Function.h:685
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition Function.h:234
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:742
const GlobalValue * getGlobal() const
bool isDSOLocal() const
bool hasExternalWeakLinkage() const
bool hasDLLImportStorageClass() const
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:59
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
TargetInstrInfo overrides.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:91
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2692
std::optional< unsigned > getOperandCycle(unsigned ItinClassIndx, unsigned OperandIdx) const
Return the cycle for the given class and operand.
bool isEmpty() const
Returns true if there are no itineraries.
bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Class to represent integer types.
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
unsigned getSchedClass() const
Return the scheduling class for this instruction.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:41
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
void moveAfter(MachineBasicBlock *NewBefore)
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
Properties which a MachineFunction may have at a given point in time.
MachineFunctionProperties & reset(Property P)
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
const MachineOperand & getOperand(unsigned i) const
unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:65
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition Pass.cpp:130
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:91
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there is any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if this node is undefined (an UNDEF node).
void setFlags(SDNodeFlags NewFlags)
static use_iterator use_end()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:132
bool empty() const
Definition SmallSet.h:168
bool erase(const T &V)
Definition SmallSet.h:193
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:181
size_t size() const
Definition SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:51
const unsigned char * bytes_end() const
Definition StringRef.h:130
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:149
const unsigned char * bytes_begin() const
Definition StringRef.h:127
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:370
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC)
Override the default CondCode to be used to test the result of the comparison libcall against zero.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC)
Set the CallingConv that should be used for the specified libcall.
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const
bool isConstTrueVal(SDValue N) const
Return true if N is a constant or constant vector equal to the true value from getBooleanContents().
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:44
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition Triple.h:403
bool isOSVersionLT(unsigned Major, unsigned Minor=0, unsigned Micro=0) const
Helper function for doing comparisons against version numbers included in the target triple.
Definition Triple.h:504
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition Triple.h:638
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:257
static IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:251
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:251
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
static Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:235
static IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:249
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:342
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:166
static IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:250
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:199
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:224
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:212
static IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:252
A Use represents the edge between a Value definition and its users.
Definition Use.h:43
const Use & getOperandUse(unsigned i) const
Definition User.h:241
Value * getOperand(unsigned i) const
Definition User.h:228
unsigned getNumOperands() const
Definition User.h:250
LLVM Value Representation.
Definition Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
use_iterator use_begin()
Definition Value.h:360
Base class of all SIMD vector types.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:202
const ParentTy * getParent() const
Definition ilist_node.h:32
self_iterator getIterator()
Definition ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS).
@ SBREL
Static Base Relative.
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
const unsigned FPStatusBits
const unsigned FPReservedBits
const unsigned RoundingBitsPos
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:779
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:243
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:752
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:490
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:153
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:511
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:257
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:573
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:743
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:246
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:813
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:497
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:205
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:157
@ GlobalAddress
Definition ISDOpcodes.h:78
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:840
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:557
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:716
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:262
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:236
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:79
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:804
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:684
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:634
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:751
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:787
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:334
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:756
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:229
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:215
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:330
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:930
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:673
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:734
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:614
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:587
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:124
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:810
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:771
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:338
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:848
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:696
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:765
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:310
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:100
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:457
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:479
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:456
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:886
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:484
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:708
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:190
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:679
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:538
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:52
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:919
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:112
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:905
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:147
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:816
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:793
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:507
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:347
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:691
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:529
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Create or insert an LLVM Function declaration for an intrinsic, and return it.
bool match(Val *V, const Pattern &P)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
auto m_Undef()
Match an arbitrary undef constant.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:54
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition DWP.cpp:480
@ Length
Definition DWP.cpp:480
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1743
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1723
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:255
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2432
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition bit.h:307
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:267
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2099
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:296
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1542
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition MathExtras.h:279
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:215
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:146
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1730
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:340
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:281
bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:291
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:193
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition DAGCombine.h:15
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register,...
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:260
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1929
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1750
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned gettBLXrOpcode(const MachineFunction &MF)
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N-bit number shifted left by S.
Definition MathExtras.h:210
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:860
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:760
static constexpr roundingMode rmTowardZero
Definition APFloat.h:258
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:389
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:278
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition ValueTypes.h:294
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:344
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:457
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:367
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:353
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:379
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:464
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:310
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:375
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:317
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:322
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:330
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:302
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:447
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:290
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:62
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:40
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:161
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:70
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:300
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:169
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:333
static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueTypes that have been interned by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setDiscardResult(bool Value=true)
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.